Llama-3.1-8B-Magpie-Align-v0.1 / trainer_state.json
flydust's picture
Model save
590b033 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9959925193694897,
"eval_steps": 100,
"global_step": 233,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004274646005877639,
"grad_norm": 3.4727758395385346,
"learning_rate": 4.166666666666666e-08,
"logits/chosen": -0.9238853454589844,
"logits/rejected": -0.9009266495704651,
"logps/chosen": -211.83998107910156,
"logps/rejected": -194.95265197753906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.008549292011755277,
"grad_norm": 3.5000648062483686,
"learning_rate": 8.333333333333333e-08,
"logits/chosen": -0.9474210739135742,
"logits/rejected": -0.9417086243629456,
"logps/chosen": -160.0943603515625,
"logps/rejected": -163.26644897460938,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.012823938017632914,
"grad_norm": 3.8566721368935113,
"learning_rate": 1.25e-07,
"logits/chosen": -0.8552289009094238,
"logits/rejected": -0.9027292132377625,
"logps/chosen": -197.13523864746094,
"logps/rejected": -191.77366638183594,
"loss": 0.6932,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.00209163804538548,
"rewards/margins": 0.0021166829392313957,
"rewards/rejected": -2.5045330403372645e-05,
"step": 3
},
{
"epoch": 0.017098584023510555,
"grad_norm": 3.527297888533762,
"learning_rate": 1.6666666666666665e-07,
"logits/chosen": -0.9195055961608887,
"logits/rejected": -0.9506024122238159,
"logps/chosen": -175.96563720703125,
"logps/rejected": -177.187255859375,
"loss": 0.6931,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0005994887324050069,
"rewards/margins": 0.001228818204253912,
"rewards/rejected": -0.0006293297046795487,
"step": 4
},
{
"epoch": 0.02137323002938819,
"grad_norm": 3.274108961837268,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -0.9131849408149719,
"logits/rejected": -0.9851359128952026,
"logps/chosen": -196.52279663085938,
"logps/rejected": -209.4899444580078,
"loss": 0.6936,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0014956831000745296,
"rewards/margins": -0.002281556138768792,
"rewards/rejected": 0.0007858729222789407,
"step": 5
},
{
"epoch": 0.02564787603526583,
"grad_norm": 3.4643988401861643,
"learning_rate": 2.5e-07,
"logits/chosen": -1.0323811769485474,
"logits/rejected": -1.0281962156295776,
"logps/chosen": -175.13864135742188,
"logps/rejected": -171.71237182617188,
"loss": 0.6934,
"rewards/accuracies": 0.34375,
"rewards/chosen": -0.0016992997843772173,
"rewards/margins": -0.0023347530514001846,
"rewards/rejected": 0.000635453499853611,
"step": 6
},
{
"epoch": 0.029922522041143467,
"grad_norm": 3.753822101296772,
"learning_rate": 2.916666666666667e-07,
"logits/chosen": -0.8140788078308105,
"logits/rejected": -0.8268399238586426,
"logps/chosen": -204.0390625,
"logps/rejected": -210.50558471679688,
"loss": 0.6926,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.0002044451393885538,
"rewards/margins": 0.0006307458970695734,
"rewards/rejected": -0.0004263008013367653,
"step": 7
},
{
"epoch": 0.03419716804702111,
"grad_norm": 3.1848827253835568,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -0.9922436475753784,
"logits/rejected": -0.9979274868965149,
"logps/chosen": -192.83494567871094,
"logps/rejected": -200.88128662109375,
"loss": 0.693,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.002161582000553608,
"rewards/margins": 0.0022183258552104235,
"rewards/rejected": -5.674359272234142e-05,
"step": 8
},
{
"epoch": 0.03847181405289874,
"grad_norm": 3.7147220039656568,
"learning_rate": 3.75e-07,
"logits/chosen": -0.9252921342849731,
"logits/rejected": -0.9685516357421875,
"logps/chosen": -175.70448303222656,
"logps/rejected": -180.89736938476562,
"loss": 0.6927,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0024666767567396164,
"rewards/margins": 0.002756566507741809,
"rewards/rejected": -0.0002898902166634798,
"step": 9
},
{
"epoch": 0.04274646005877638,
"grad_norm": 3.553251668230928,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -0.9595114588737488,
"logits/rejected": -0.9833444356918335,
"logps/chosen": -208.72735595703125,
"logps/rejected": -214.8730926513672,
"loss": 0.693,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.002427927916869521,
"rewards/margins": 0.001882559503428638,
"rewards/rejected": 0.0005453681806102395,
"step": 10
},
{
"epoch": 0.04702110606465402,
"grad_norm": 3.4673888891096216,
"learning_rate": 4.5833333333333327e-07,
"logits/chosen": -0.986074686050415,
"logits/rejected": -0.9903304576873779,
"logps/chosen": -138.227783203125,
"logps/rejected": -137.13824462890625,
"loss": 0.6931,
"rewards/accuracies": 0.40625,
"rewards/chosen": 0.0006541174370795488,
"rewards/margins": -0.0011124282609671354,
"rewards/rejected": 0.0017665456980466843,
"step": 11
},
{
"epoch": 0.05129575207053166,
"grad_norm": 3.412261478741586,
"learning_rate": 5e-07,
"logits/chosen": -0.896647036075592,
"logits/rejected": -0.9640191197395325,
"logps/chosen": -157.36685180664062,
"logps/rejected": -180.9624481201172,
"loss": 0.693,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0007026732200756669,
"rewards/margins": 0.0007272702641785145,
"rewards/rejected": -2.459682582411915e-05,
"step": 12
},
{
"epoch": 0.055570398076409296,
"grad_norm": 3.5015942981464434,
"learning_rate": 5.416666666666666e-07,
"logits/chosen": -0.8603953123092651,
"logits/rejected": -0.8457555770874023,
"logps/chosen": -190.04727172851562,
"logps/rejected": -196.87872314453125,
"loss": 0.6918,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.0017774368170648813,
"rewards/margins": 0.002889451337978244,
"rewards/rejected": -0.0011120146373286843,
"step": 13
},
{
"epoch": 0.059845044082286934,
"grad_norm": 3.3564122983724283,
"learning_rate": 5.833333333333334e-07,
"logits/chosen": -0.9946928024291992,
"logits/rejected": -0.9674972295761108,
"logps/chosen": -173.98526000976562,
"logps/rejected": -167.90187072753906,
"loss": 0.6926,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.00402639526873827,
"rewards/margins": 0.00235772505402565,
"rewards/rejected": 0.0016686702147126198,
"step": 14
},
{
"epoch": 0.06411969008816458,
"grad_norm": 3.6183547057085903,
"learning_rate": 6.249999999999999e-07,
"logits/chosen": -0.9302492737770081,
"logits/rejected": -0.9131873846054077,
"logps/chosen": -172.501953125,
"logps/rejected": -165.2920684814453,
"loss": 0.6914,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.008232050575315952,
"rewards/margins": 0.0026909802109003067,
"rewards/rejected": 0.005541070364415646,
"step": 15
},
{
"epoch": 0.06839433609404222,
"grad_norm": 4.094403587057344,
"learning_rate": 6.666666666666666e-07,
"logits/chosen": -0.8987658023834229,
"logits/rejected": -0.918194591999054,
"logps/chosen": -182.8192901611328,
"logps/rejected": -188.6702423095703,
"loss": 0.6907,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0033055683597922325,
"rewards/margins": 0.006322154775261879,
"rewards/rejected": -0.003016585949808359,
"step": 16
},
{
"epoch": 0.07266898209991986,
"grad_norm": 3.7556102735295602,
"learning_rate": 7.083333333333334e-07,
"logits/chosen": -0.7985554933547974,
"logits/rejected": -0.8355307579040527,
"logps/chosen": -218.515869140625,
"logps/rejected": -218.05130004882812,
"loss": 0.6907,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.002777060493826866,
"rewards/margins": 0.0028633405454456806,
"rewards/rejected": -8.627981878817081e-05,
"step": 17
},
{
"epoch": 0.07694362810579748,
"grad_norm": 3.7267439469140835,
"learning_rate": 7.5e-07,
"logits/chosen": -1.0510368347167969,
"logits/rejected": -1.1025066375732422,
"logps/chosen": -187.49362182617188,
"logps/rejected": -213.5237274169922,
"loss": 0.6893,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.003624723292887211,
"rewards/margins": 0.0034801624715328217,
"rewards/rejected": 0.00014456117060035467,
"step": 18
},
{
"epoch": 0.08121827411167512,
"grad_norm": 3.6435544044761947,
"learning_rate": 7.916666666666666e-07,
"logits/chosen": -1.0699188709259033,
"logits/rejected": -1.0673398971557617,
"logps/chosen": -185.699951171875,
"logps/rejected": -175.41836547851562,
"loss": 0.6889,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.01173433754593134,
"rewards/margins": 0.009952141903340816,
"rewards/rejected": 0.001782197505235672,
"step": 19
},
{
"epoch": 0.08549292011755276,
"grad_norm": 3.56212142993403,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": -0.9693958759307861,
"logits/rejected": -1.0447947978973389,
"logps/chosen": -160.5248260498047,
"logps/rejected": -177.9250030517578,
"loss": 0.6883,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0071954806335270405,
"rewards/margins": 0.007437723223119974,
"rewards/rejected": -0.00024224258959293365,
"step": 20
},
{
"epoch": 0.0897675661234304,
"grad_norm": 3.599911110818667,
"learning_rate": 8.75e-07,
"logits/chosen": -0.8949970006942749,
"logits/rejected": -0.9538885951042175,
"logps/chosen": -155.24188232421875,
"logps/rejected": -175.83969116210938,
"loss": 0.6875,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0026197312399744987,
"rewards/margins": 0.013713551685214043,
"rewards/rejected": -0.016333281993865967,
"step": 21
},
{
"epoch": 0.09404221212930804,
"grad_norm": 3.809589692220953,
"learning_rate": 9.166666666666665e-07,
"logits/chosen": -0.8426035642623901,
"logits/rejected": -0.909124493598938,
"logps/chosen": -162.82546997070312,
"logps/rejected": -168.50677490234375,
"loss": 0.6853,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.014116955921053886,
"rewards/margins": 0.008742437697947025,
"rewards/rejected": 0.005374519154429436,
"step": 22
},
{
"epoch": 0.09831685813518568,
"grad_norm": 3.8481134168387325,
"learning_rate": 9.583333333333334e-07,
"logits/chosen": -0.9963463544845581,
"logits/rejected": -1.030158281326294,
"logps/chosen": -212.16732788085938,
"logps/rejected": -226.55050659179688,
"loss": 0.6831,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.005188916344195604,
"rewards/margins": 0.037282973527908325,
"rewards/rejected": -0.04247189313173294,
"step": 23
},
{
"epoch": 0.10259150414106331,
"grad_norm": 4.017050664188984,
"learning_rate": 1e-06,
"logits/chosen": -0.9375415444374084,
"logits/rejected": -0.9786323308944702,
"logps/chosen": -161.66697692871094,
"logps/rejected": -171.43328857421875,
"loss": 0.6793,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0014034155756235123,
"rewards/margins": 0.029357939958572388,
"rewards/rejected": -0.03076135367155075,
"step": 24
},
{
"epoch": 0.10686615014694095,
"grad_norm": 4.219585279560121,
"learning_rate": 9.999435142363483e-07,
"logits/chosen": -0.9440574049949646,
"logits/rejected": -0.97591233253479,
"logps/chosen": -142.18214416503906,
"logps/rejected": -145.74217224121094,
"loss": 0.6753,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.006380043923854828,
"rewards/margins": 0.02844325453042984,
"rewards/rejected": -0.022063210606575012,
"step": 25
},
{
"epoch": 0.11114079615281859,
"grad_norm": 4.465336404650454,
"learning_rate": 9.997740697079592e-07,
"logits/chosen": -0.907569408416748,
"logits/rejected": -0.9431344270706177,
"logps/chosen": -186.16468811035156,
"logps/rejected": -188.70187377929688,
"loss": 0.6698,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.03334157541394234,
"rewards/margins": 0.04588525742292404,
"rewards/rejected": -0.07922682911157608,
"step": 26
},
{
"epoch": 0.11541544215869623,
"grad_norm": 4.035688150172887,
"learning_rate": 9.994917046996472e-07,
"logits/chosen": -0.9081155061721802,
"logits/rejected": -0.9375332593917847,
"logps/chosen": -196.47586059570312,
"logps/rejected": -210.2967071533203,
"loss": 0.6745,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.06558644771575928,
"rewards/margins": 0.024703964591026306,
"rewards/rejected": -0.09029041230678558,
"step": 27
},
{
"epoch": 0.11969008816457387,
"grad_norm": 4.589583975444085,
"learning_rate": 9.990964830098245e-07,
"logits/chosen": -0.9100086688995361,
"logits/rejected": -0.9473557472229004,
"logps/chosen": -183.28317260742188,
"logps/rejected": -191.90957641601562,
"loss": 0.6642,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.0952601209282875,
"rewards/margins": 0.06084320694208145,
"rewards/rejected": -0.15610332787036896,
"step": 28
},
{
"epoch": 0.12396473417045151,
"grad_norm": 4.479468138978008,
"learning_rate": 9.985884939360872e-07,
"logits/chosen": -1.1165940761566162,
"logits/rejected": -1.1295504570007324,
"logps/chosen": -166.12542724609375,
"logps/rejected": -165.3243408203125,
"loss": 0.6578,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11186902225017548,
"rewards/margins": 0.050340794026851654,
"rewards/rejected": -0.16220980882644653,
"step": 29
},
{
"epoch": 0.12823938017632916,
"grad_norm": 4.703342289738615,
"learning_rate": 9.97967852255038e-07,
"logits/chosen": -0.9528751969337463,
"logits/rejected": -0.9631531238555908,
"logps/chosen": -254.89320373535156,
"logps/rejected": -258.4338073730469,
"loss": 0.6583,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.22431208193302155,
"rewards/margins": 0.050895195454359055,
"rewards/rejected": -0.2752072513103485,
"step": 30
},
{
"epoch": 0.13251402618220678,
"grad_norm": 4.869138630164683,
"learning_rate": 9.972346981963546e-07,
"logits/chosen": -1.059159755706787,
"logits/rejected": -1.1036772727966309,
"logps/chosen": -245.163330078125,
"logps/rejected": -268.007568359375,
"loss": 0.6513,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.28532153367996216,
"rewards/margins": 0.1269759237766266,
"rewards/rejected": -0.41229742765426636,
"step": 31
},
{
"epoch": 0.13678867218808444,
"grad_norm": 4.841549329203085,
"learning_rate": 9.96389197411104e-07,
"logits/chosen": -0.9731124043464661,
"logits/rejected": -1.025037169456482,
"logps/chosen": -209.7532958984375,
"logps/rejected": -234.02642822265625,
"loss": 0.6414,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.20661108195781708,
"rewards/margins": 0.18127745389938354,
"rewards/rejected": -0.38788852095603943,
"step": 32
},
{
"epoch": 0.14106331819396206,
"grad_norm": 4.527086707272363,
"learning_rate": 9.954315409343168e-07,
"logits/chosen": -0.9516006708145142,
"logits/rejected": -1.0085594654083252,
"logps/chosen": -233.74896240234375,
"logps/rejected": -257.0697937011719,
"loss": 0.6391,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3886350095272064,
"rewards/margins": 0.19288921356201172,
"rewards/rejected": -0.5815242528915405,
"step": 33
},
{
"epoch": 0.14533796419983971,
"grad_norm": 5.675688535211087,
"learning_rate": 9.943619451418224e-07,
"logits/chosen": -0.9171434640884399,
"logits/rejected": -0.9520907998085022,
"logps/chosen": -232.1197967529297,
"logps/rejected": -252.1339874267578,
"loss": 0.6138,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4231939911842346,
"rewards/margins": 0.2075667530298233,
"rewards/rejected": -0.6307607293128967,
"step": 34
},
{
"epoch": 0.14961261020571734,
"grad_norm": 4.555434497600014,
"learning_rate": 9.931806517013612e-07,
"logits/chosen": -0.9599072933197021,
"logits/rejected": -0.9873026013374329,
"logps/chosen": -235.87911987304688,
"logps/rejected": -277.68585205078125,
"loss": 0.6249,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5353493690490723,
"rewards/margins": 0.26459625363349915,
"rewards/rejected": -0.799945592880249,
"step": 35
},
{
"epoch": 0.15388725621159496,
"grad_norm": 4.824318328500942,
"learning_rate": 9.918879275179817e-07,
"logits/chosen": -1.1668760776519775,
"logits/rejected": -1.1293714046478271,
"logps/chosen": -288.35406494140625,
"logps/rejected": -298.5234375,
"loss": 0.6005,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6688686609268188,
"rewards/margins": 0.22619600594043732,
"rewards/rejected": -0.8950645923614502,
"step": 36
},
{
"epoch": 0.15816190221747262,
"grad_norm": 4.401543973490666,
"learning_rate": 9.904840646737345e-07,
"logits/chosen": -0.9521760940551758,
"logits/rejected": -0.9997081756591797,
"logps/chosen": -282.0852355957031,
"logps/rejected": -336.65020751953125,
"loss": 0.6319,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7313822507858276,
"rewards/margins": 0.29729628562927246,
"rewards/rejected": -1.0286785364151,
"step": 37
},
{
"epoch": 0.16243654822335024,
"grad_norm": 5.15798330777588,
"learning_rate": 9.889693803616791e-07,
"logits/chosen": -1.0276933908462524,
"logits/rejected": -1.045649766921997,
"logps/chosen": -311.87677001953125,
"logps/rejected": -334.0548095703125,
"loss": 0.6,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0708937644958496,
"rewards/margins": 0.2531777620315552,
"rewards/rejected": -1.3240714073181152,
"step": 38
},
{
"epoch": 0.1667111942292279,
"grad_norm": 4.8993868987417555,
"learning_rate": 9.873442168142157e-07,
"logits/chosen": -0.909888505935669,
"logits/rejected": -0.9343925714492798,
"logps/chosen": -254.18350219726562,
"logps/rejected": -285.18243408203125,
"loss": 0.5973,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9324233531951904,
"rewards/margins": 0.22102315723896027,
"rewards/rejected": -1.1534464359283447,
"step": 39
},
{
"epoch": 0.17098584023510552,
"grad_norm": 4.965329298176294,
"learning_rate": 9.856089412257604e-07,
"logits/chosen": -0.8430695533752441,
"logits/rejected": -0.8712520599365234,
"logps/chosen": -278.5356750488281,
"logps/rejected": -313.9254455566406,
"loss": 0.5892,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.097804069519043,
"rewards/margins": 0.29625624418258667,
"rewards/rejected": -1.3940601348876953,
"step": 40
},
{
"epoch": 0.17526048624098317,
"grad_norm": 4.964663460213464,
"learning_rate": 9.8376394566978e-07,
"logits/chosen": -0.9349880218505859,
"logits/rejected": -0.9195177555084229,
"logps/chosen": -353.0047607421875,
"logps/rejected": -375.0325927734375,
"loss": 0.5905,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4320145845413208,
"rewards/margins": 0.2865561842918396,
"rewards/rejected": -1.7185708284378052,
"step": 41
},
{
"epoch": 0.1795351322468608,
"grad_norm": 4.673829673061791,
"learning_rate": 9.818096470102066e-07,
"logits/chosen": -0.9460776448249817,
"logits/rejected": -1.0075451135635376,
"logps/chosen": -326.8829040527344,
"logps/rejected": -359.4068908691406,
"loss": 0.59,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3413481712341309,
"rewards/margins": 0.43105652928352356,
"rewards/rejected": -1.772404670715332,
"step": 42
},
{
"epoch": 0.18380977825273845,
"grad_norm": 4.756386075263894,
"learning_rate": 9.797464868072486e-07,
"logits/chosen": -0.8998066186904907,
"logits/rejected": -0.9348124265670776,
"logps/chosen": -347.86090087890625,
"logps/rejected": -441.64501953125,
"loss": 0.5674,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.69937264919281,
"rewards/margins": 0.7839919924736023,
"rewards/rejected": -2.4833645820617676,
"step": 43
},
{
"epoch": 0.18808442425861607,
"grad_norm": 5.666891206447458,
"learning_rate": 9.775749312176248e-07,
"logits/chosen": -0.8193731307983398,
"logits/rejected": -0.8275444507598877,
"logps/chosen": -334.3702697753906,
"logps/rejected": -402.1867370605469,
"loss": 0.592,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.655839204788208,
"rewards/margins": 0.6519087553024292,
"rewards/rejected": -2.3077480792999268,
"step": 44
},
{
"epoch": 0.19235907026449373,
"grad_norm": 5.611553010112512,
"learning_rate": 9.752954708892377e-07,
"logits/chosen": -0.8545299172401428,
"logits/rejected": -0.9027716517448425,
"logps/chosen": -371.7701721191406,
"logps/rejected": -439.71881103515625,
"loss": 0.5779,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.9063547849655151,
"rewards/margins": 0.5792344808578491,
"rewards/rejected": -2.4855895042419434,
"step": 45
},
{
"epoch": 0.19663371627037135,
"grad_norm": 5.062237682542423,
"learning_rate": 9.729086208503173e-07,
"logits/chosen": -0.9441611766815186,
"logits/rejected": -0.956858217716217,
"logps/chosen": -451.3914794921875,
"logps/rejected": -498.17999267578125,
"loss": 0.5592,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.6227638721466064,
"rewards/margins": 0.4997914731502533,
"rewards/rejected": -3.1225552558898926,
"step": 46
},
{
"epoch": 0.200908362276249,
"grad_norm": 5.547722907580694,
"learning_rate": 9.70414920393052e-07,
"logits/chosen": -0.8402402400970459,
"logits/rejected": -0.8305561542510986,
"logps/chosen": -410.6358642578125,
"logps/rejected": -456.8866882324219,
"loss": 0.5657,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.3196969032287598,
"rewards/margins": 0.5383195281028748,
"rewards/rejected": -2.8580164909362793,
"step": 47
},
{
"epoch": 0.20518300828212663,
"grad_norm": 5.843768728466239,
"learning_rate": 9.678149329517409e-07,
"logits/chosen": -0.9230031967163086,
"logits/rejected": -0.9459983706474304,
"logps/chosen": -421.91253662109375,
"logps/rejected": -464.15460205078125,
"loss": 0.5158,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.3388688564300537,
"rewards/margins": 0.5758055448532104,
"rewards/rejected": -2.9146745204925537,
"step": 48
},
{
"epoch": 0.20945765428800428,
"grad_norm": 6.600018288252386,
"learning_rate": 9.651092459754877e-07,
"logits/chosen": -0.7874542474746704,
"logits/rejected": -0.7807765007019043,
"logps/chosen": -553.3658447265625,
"logps/rejected": -578.9154052734375,
"loss": 0.5601,
"rewards/accuracies": 0.65625,
"rewards/chosen": -3.3889737129211426,
"rewards/margins": 0.2704327702522278,
"rewards/rejected": -3.6594066619873047,
"step": 49
},
{
"epoch": 0.2137323002938819,
"grad_norm": 6.555644686187637,
"learning_rate": 9.62298470795473e-07,
"logits/chosen": -0.7596021890640259,
"logits/rejected": -0.8105506896972656,
"logps/chosen": -396.783935546875,
"logps/rejected": -437.79541015625,
"loss": 0.5798,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.4116053581237793,
"rewards/margins": 0.3920546770095825,
"rewards/rejected": -2.8036601543426514,
"step": 50
},
{
"epoch": 0.21800694629975956,
"grad_norm": 6.744038117473701,
"learning_rate": 9.59383242486827e-07,
"logits/chosen": -0.8625648617744446,
"logits/rejected": -0.8875184059143066,
"logps/chosen": -505.1508483886719,
"logps/rejected": -608.79248046875,
"loss": 0.5389,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.0446105003356934,
"rewards/margins": 0.9900886416435242,
"rewards/rejected": -4.034698963165283,
"step": 51
},
{
"epoch": 0.22228159230563718,
"grad_norm": 5.565039920114929,
"learning_rate": 9.56364219725138e-07,
"logits/chosen": -0.8463042974472046,
"logits/rejected": -0.8962733745574951,
"logps/chosen": -499.99041748046875,
"logps/rejected": -634.49072265625,
"loss": 0.4915,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.071751832962036,
"rewards/margins": 1.300065279006958,
"rewards/rejected": -4.371817588806152,
"step": 52
},
{
"epoch": 0.22655623831151483,
"grad_norm": 9.993449327747468,
"learning_rate": 9.532420846376315e-07,
"logits/chosen": -0.7763329744338989,
"logits/rejected": -0.8177902698516846,
"logps/chosen": -433.4925842285156,
"logps/rejected": -530.1204223632812,
"loss": 0.6104,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.5958311557769775,
"rewards/margins": 0.9427847862243652,
"rewards/rejected": -3.5386157035827637,
"step": 53
},
{
"epoch": 0.23083088431739246,
"grad_norm": 6.914149355278302,
"learning_rate": 9.500175426490454e-07,
"logits/chosen": -0.7263307571411133,
"logits/rejected": -0.7950284481048584,
"logps/chosen": -590.033447265625,
"logps/rejected": -696.9810791015625,
"loss": 0.5291,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.79225492477417,
"rewards/margins": 0.9605345726013184,
"rewards/rejected": -4.75278902053833,
"step": 54
},
{
"epoch": 0.2351055303232701,
"grad_norm": 6.4385225722348425,
"learning_rate": 9.466913223222465e-07,
"logits/chosen": -0.73805832862854,
"logits/rejected": -0.8121139407157898,
"logps/chosen": -527.0991821289062,
"logps/rejected": -672.7268676757812,
"loss": 0.536,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.397312641143799,
"rewards/margins": 1.3639640808105469,
"rewards/rejected": -4.761276721954346,
"step": 55
},
{
"epoch": 0.23938017632914774,
"grad_norm": 7.766777578194787,
"learning_rate": 9.432641751936162e-07,
"logits/chosen": -0.8009728193283081,
"logits/rejected": -0.8259899020195007,
"logps/chosen": -421.18414306640625,
"logps/rejected": -515.0050659179688,
"loss": 0.5853,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.522728204727173,
"rewards/margins": 0.8710657954216003,
"rewards/rejected": -3.393793821334839,
"step": 56
},
{
"epoch": 0.2436548223350254,
"grad_norm": 6.5333907258413655,
"learning_rate": 9.397368756032444e-07,
"logits/chosen": -0.7609117031097412,
"logits/rejected": -0.7754147052764893,
"logps/chosen": -436.06427001953125,
"logps/rejected": -512.485107421875,
"loss": 0.5019,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.599217414855957,
"rewards/margins": 0.6933461427688599,
"rewards/rejected": -3.2925636768341064,
"step": 57
},
{
"epoch": 0.24792946834090301,
"grad_norm": 7.290942942753059,
"learning_rate": 9.36110220519976e-07,
"logits/chosen": -0.7123927474021912,
"logits/rejected": -0.7812705039978027,
"logps/chosen": -428.41351318359375,
"logps/rejected": -493.216552734375,
"loss": 0.5486,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.501229763031006,
"rewards/margins": 0.6336008906364441,
"rewards/rejected": -3.1348307132720947,
"step": 58
},
{
"epoch": 0.25220411434678064,
"grad_norm": 7.291539302989226,
"learning_rate": 9.323850293613379e-07,
"logits/chosen": -0.8743740916252136,
"logits/rejected": -0.8304850459098816,
"logps/chosen": -416.0941467285156,
"logps/rejected": -461.4928283691406,
"loss": 0.5248,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.469886064529419,
"rewards/margins": 0.6079959869384766,
"rewards/rejected": -3.0778818130493164,
"step": 59
},
{
"epoch": 0.2564787603526583,
"grad_norm": 6.444086326444115,
"learning_rate": 9.285621438083997e-07,
"logits/chosen": -0.7638828754425049,
"logits/rejected": -0.830043375492096,
"logps/chosen": -462.22723388671875,
"logps/rejected": -567.3082275390625,
"loss": 0.496,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.678903818130493,
"rewards/margins": 0.9149044752120972,
"rewards/rejected": -3.59380841255188,
"step": 60
},
{
"epoch": 0.26075340635853594,
"grad_norm": 6.239441558672264,
"learning_rate": 9.246424276156006e-07,
"logits/chosen": -0.7686220407485962,
"logits/rejected": -0.786496102809906,
"logps/chosen": -426.57977294921875,
"logps/rejected": -539.332763671875,
"loss": 0.4872,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.421875,
"rewards/margins": 1.0811634063720703,
"rewards/rejected": -3.5030384063720703,
"step": 61
},
{
"epoch": 0.26502805236441357,
"grad_norm": 6.955390195667037,
"learning_rate": 9.206267664155906e-07,
"logits/chosen": -0.8518524765968323,
"logits/rejected": -0.8896721005439758,
"logps/chosen": -490.898681640625,
"logps/rejected": -571.5889282226562,
"loss": 0.5328,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.7304701805114746,
"rewards/margins": 0.8642421364784241,
"rewards/rejected": -3.594712495803833,
"step": 62
},
{
"epoch": 0.2693026983702912,
"grad_norm": 6.71732914608112,
"learning_rate": 9.165160675191271e-07,
"logits/chosen": -0.7856395244598389,
"logits/rejected": -0.8273566961288452,
"logps/chosen": -406.04241943359375,
"logps/rejected": -512.580810546875,
"loss": 0.54,
"rewards/accuracies": 0.84375,
"rewards/chosen": -2.3751254081726074,
"rewards/margins": 1.008442759513855,
"rewards/rejected": -3.3835678100585938,
"step": 63
},
{
"epoch": 0.2735773443761689,
"grad_norm": 8.23163566342515,
"learning_rate": 9.123112597100757e-07,
"logits/chosen": -0.7550954818725586,
"logits/rejected": -0.7312250733375549,
"logps/chosen": -428.09210205078125,
"logps/rejected": -472.0053405761719,
"loss": 0.5628,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.460500717163086,
"rewards/margins": 0.6525804996490479,
"rewards/rejected": -3.1130809783935547,
"step": 64
},
{
"epoch": 0.2778519903820465,
"grad_norm": 6.484017150981526,
"learning_rate": 9.080132930355566e-07,
"logits/chosen": -0.7198902368545532,
"logits/rejected": -0.7333334684371948,
"logps/chosen": -447.80694580078125,
"logps/rejected": -543.547119140625,
"loss": 0.4951,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.3654160499572754,
"rewards/margins": 1.1327059268951416,
"rewards/rejected": -3.498121738433838,
"step": 65
},
{
"epoch": 0.2821266363879241,
"grad_norm": 7.88288469592664,
"learning_rate": 9.036231385912889e-07,
"logits/chosen": -0.787277102470398,
"logits/rejected": -0.8082758188247681,
"logps/chosen": -542.9320678710938,
"logps/rejected": -589.2691650390625,
"loss": 0.5554,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.2829174995422363,
"rewards/margins": 0.4686228036880493,
"rewards/rejected": -3.751540184020996,
"step": 66
},
{
"epoch": 0.28640128239380175,
"grad_norm": 6.87598900963406,
"learning_rate": 8.991417883021779e-07,
"logits/chosen": -0.7320197820663452,
"logits/rejected": -0.7914742231369019,
"logps/chosen": -322.4740295410156,
"logps/rejected": -403.00982666015625,
"loss": 0.489,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.867960810661316,
"rewards/margins": 0.7985786199569702,
"rewards/rejected": -2.666539430618286,
"step": 67
},
{
"epoch": 0.29067592839967943,
"grad_norm": 6.988332805689512,
"learning_rate": 8.945702546981968e-07,
"logits/chosen": -0.7299609780311584,
"logits/rejected": -0.7391811013221741,
"logps/chosen": -424.68255615234375,
"logps/rejected": -520.7315673828125,
"loss": 0.485,
"rewards/accuracies": 0.84375,
"rewards/chosen": -2.4903666973114014,
"rewards/margins": 0.9038018584251404,
"rewards/rejected": -3.3941686153411865,
"step": 68
},
{
"epoch": 0.29495057440555705,
"grad_norm": 7.25643319614823,
"learning_rate": 8.899095706856121e-07,
"logits/chosen": -0.8242793679237366,
"logits/rejected": -0.8567203879356384,
"logps/chosen": -416.467041015625,
"logps/rejected": -556.6646118164062,
"loss": 0.501,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.350522994995117,
"rewards/margins": 1.3620076179504395,
"rewards/rejected": -3.7125303745269775,
"step": 69
},
{
"epoch": 0.2992252204114347,
"grad_norm": 7.602549324125367,
"learning_rate": 8.851607893136064e-07,
"logits/chosen": -0.7457299828529358,
"logits/rejected": -0.7355296611785889,
"logps/chosen": -458.4794006347656,
"logps/rejected": -523.21484375,
"loss": 0.4974,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.8070144653320312,
"rewards/margins": 0.6739380955696106,
"rewards/rejected": -3.480952739715576,
"step": 70
},
{
"epoch": 0.3034998664173123,
"grad_norm": 7.842587956186825,
"learning_rate": 8.803249835363484e-07,
"logits/chosen": -0.7719243168830872,
"logits/rejected": -0.8175538778305054,
"logps/chosen": -391.3406982421875,
"logps/rejected": -472.3711242675781,
"loss": 0.5232,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.26741099357605,
"rewards/margins": 0.7575722336769104,
"rewards/rejected": -3.0249834060668945,
"step": 71
},
{
"epoch": 0.3077745124231899,
"grad_norm": 8.33328622593718,
"learning_rate": 8.754032459705671e-07,
"logits/chosen": -0.7375326752662659,
"logits/rejected": -0.7411423921585083,
"logps/chosen": -552.6005249023438,
"logps/rejected": -658.5523071289062,
"loss": 0.4689,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.2394614219665527,
"rewards/margins": 1.1419782638549805,
"rewards/rejected": -4.381440162658691,
"step": 72
},
{
"epoch": 0.3120491584290676,
"grad_norm": 8.00590533781815,
"learning_rate": 8.703966886486818e-07,
"logits/chosen": -0.7447977066040039,
"logits/rejected": -0.8021827340126038,
"logps/chosen": -528.2827758789062,
"logps/rejected": -663.02099609375,
"loss": 0.4719,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.100710868835449,
"rewards/margins": 1.4192044734954834,
"rewards/rejected": -4.519914627075195,
"step": 73
},
{
"epoch": 0.31632380443494523,
"grad_norm": 9.911560282455111,
"learning_rate": 8.653064427675469e-07,
"logits/chosen": -0.7718651294708252,
"logits/rejected": -0.7922145128250122,
"logps/chosen": -473.9974365234375,
"logps/rejected": -587.6644897460938,
"loss": 0.559,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.9299654960632324,
"rewards/margins": 1.1681456565856934,
"rewards/rejected": -4.098111152648926,
"step": 74
},
{
"epoch": 0.32059845044082286,
"grad_norm": 7.86727024374843,
"learning_rate": 8.601336584328658e-07,
"logits/chosen": -0.6917619705200195,
"logits/rejected": -0.6980517506599426,
"logps/chosen": -500.0274963378906,
"logps/rejected": -590.8597412109375,
"loss": 0.4719,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.1366689205169678,
"rewards/margins": 0.9159411191940308,
"rewards/rejected": -4.052610397338867,
"step": 75
},
{
"epoch": 0.3248730964467005,
"grad_norm": 9.289091160075673,
"learning_rate": 8.548795043993315e-07,
"logits/chosen": -0.7438817620277405,
"logits/rejected": -0.7294880747795105,
"logps/chosen": -521.81005859375,
"logps/rejected": -567.1892700195312,
"loss": 0.5389,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.433103322982788,
"rewards/margins": 0.4686957001686096,
"rewards/rejected": -3.901798725128174,
"step": 76
},
{
"epoch": 0.32914774245257816,
"grad_norm": 8.347011918030578,
"learning_rate": 8.495451678065561e-07,
"logits/chosen": -0.7081446647644043,
"logits/rejected": -0.7084572315216064,
"logps/chosen": -471.94879150390625,
"logps/rejected": -576.2655639648438,
"loss": 0.4923,
"rewards/accuracies": 0.84375,
"rewards/chosen": -2.9118268489837646,
"rewards/margins": 1.087062954902649,
"rewards/rejected": -3.9988901615142822,
"step": 77
},
{
"epoch": 0.3334223884584558,
"grad_norm": 9.178664809436675,
"learning_rate": 8.441318539108432e-07,
"logits/chosen": -0.672901451587677,
"logits/rejected": -0.6473367214202881,
"logps/chosen": -446.5679931640625,
"logps/rejected": -525.839599609375,
"loss": 0.4831,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.884944200515747,
"rewards/margins": 0.8302309513092041,
"rewards/rejected": -3.7151753902435303,
"step": 78
},
{
"epoch": 0.3376970344643334,
"grad_norm": 9.222651758068919,
"learning_rate": 8.386407858128706e-07,
"logits/chosen": -0.7438699007034302,
"logits/rejected": -0.7355214357376099,
"logps/chosen": -530.9581909179688,
"logps/rejected": -658.5861206054688,
"loss": 0.4838,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.4786720275878906,
"rewards/margins": 1.2334851026535034,
"rewards/rejected": -4.712156772613525,
"step": 79
},
{
"epoch": 0.34197168047021104,
"grad_norm": 8.799544386144294,
"learning_rate": 8.330732041813366e-07,
"logits/chosen": -0.5365869402885437,
"logits/rejected": -0.5661185383796692,
"logps/chosen": -488.5903015136719,
"logps/rejected": -571.9346923828125,
"loss": 0.4732,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.123514175415039,
"rewards/margins": 0.8028107285499573,
"rewards/rejected": -3.9263250827789307,
"step": 80
},
{
"epoch": 0.3462463264760887,
"grad_norm": 9.030244312954085,
"learning_rate": 8.274303669726426e-07,
"logits/chosen": -0.6187846660614014,
"logits/rejected": -0.6990691423416138,
"logps/chosen": -469.09161376953125,
"logps/rejected": -603.3698120117188,
"loss": 0.4698,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.157665729522705,
"rewards/margins": 1.1742490530014038,
"rewards/rejected": -4.331915378570557,
"step": 81
},
{
"epoch": 0.35052097248196634,
"grad_norm": 8.841699455559503,
"learning_rate": 8.217135491466636e-07,
"logits/chosen": -0.473153293132782,
"logits/rejected": -0.5449516177177429,
"logps/chosen": -491.49249267578125,
"logps/rejected": -654.7282104492188,
"loss": 0.4764,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.222175121307373,
"rewards/margins": 1.6088354587554932,
"rewards/rejected": -4.831010341644287,
"step": 82
},
{
"epoch": 0.35479561848784397,
"grad_norm": 11.098615894997318,
"learning_rate": 8.159240423786819e-07,
"logits/chosen": -0.6635532379150391,
"logits/rejected": -0.6708536148071289,
"logps/chosen": -529.8382568359375,
"logps/rejected": -615.276123046875,
"loss": 0.5068,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.458679437637329,
"rewards/margins": 0.8304582238197327,
"rewards/rejected": -4.289137840270996,
"step": 83
},
{
"epoch": 0.3590702644937216,
"grad_norm": 10.69217432485512,
"learning_rate": 8.100631547675416e-07,
"logits/chosen": -0.5764239430427551,
"logits/rejected": -0.6042333245277405,
"logps/chosen": -538.3191528320312,
"logps/rejected": -671.1778564453125,
"loss": 0.463,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.707031726837158,
"rewards/margins": 1.2638828754425049,
"rewards/rejected": -4.970914363861084,
"step": 84
},
{
"epoch": 0.36334491049959927,
"grad_norm": 10.389532135595351,
"learning_rate": 8.041322105400921e-07,
"logits/chosen": -0.5952804088592529,
"logits/rejected": -0.5918059349060059,
"logps/chosen": -468.830322265625,
"logps/rejected": -555.058349609375,
"loss": 0.4507,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.2899224758148193,
"rewards/margins": 0.8072735071182251,
"rewards/rejected": -4.097196102142334,
"step": 85
},
{
"epoch": 0.3676195565054769,
"grad_norm": 9.76247745990493,
"learning_rate": 7.981325497519891e-07,
"logits/chosen": -0.5011740922927856,
"logits/rejected": -0.5748673677444458,
"logps/chosen": -568.4143676757812,
"logps/rejected": -665.891357421875,
"loss": 0.473,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.9403202533721924,
"rewards/margins": 0.8801581859588623,
"rewards/rejected": -4.820478439331055,
"step": 86
},
{
"epoch": 0.3718942025113545,
"grad_norm": 9.276763138531336,
"learning_rate": 7.920655279849171e-07,
"logits/chosen": -0.6208050847053528,
"logits/rejected": -0.6661792993545532,
"logps/chosen": -454.78558349609375,
"logps/rejected": -583.1072387695312,
"loss": 0.439,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.0670785903930664,
"rewards/margins": 1.2415242195129395,
"rewards/rejected": -4.308602333068848,
"step": 87
},
{
"epoch": 0.37616884851723215,
"grad_norm": 8.47887809958896,
"learning_rate": 7.859325160403071e-07,
"logits/chosen": -0.5842097401618958,
"logits/rejected": -0.6111244559288025,
"logps/chosen": -513.4686279296875,
"logps/rejected": -631.78515625,
"loss": 0.4224,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.507704973220825,
"rewards/margins": 1.1260159015655518,
"rewards/rejected": -4.633721351623535,
"step": 88
},
{
"epoch": 0.3804434945231098,
"grad_norm": 9.042970665585285,
"learning_rate": 7.797348996296114e-07,
"logits/chosen": -0.594511091709137,
"logits/rejected": -0.5762075185775757,
"logps/chosen": -528.5706787109375,
"logps/rejected": -640.2254638671875,
"loss": 0.4195,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.5566911697387695,
"rewards/margins": 1.1777138710021973,
"rewards/rejected": -4.734404563903809,
"step": 89
},
{
"epoch": 0.38471814052898745,
"grad_norm": 9.591838712191619,
"learning_rate": 7.734740790612136e-07,
"logits/chosen": -0.5243846774101257,
"logits/rejected": -0.5419484376907349,
"logps/chosen": -597.5997924804688,
"logps/rejected": -721.8450927734375,
"loss": 0.4525,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.259735107421875,
"rewards/margins": 1.2303788661956787,
"rewards/rejected": -5.490115165710449,
"step": 90
},
{
"epoch": 0.3889927865348651,
"grad_norm": 9.197817870604572,
"learning_rate": 7.671514689240365e-07,
"logits/chosen": -0.5726766586303711,
"logits/rejected": -0.6172913312911987,
"logps/chosen": -557.262939453125,
"logps/rejected": -697.6727294921875,
"loss": 0.4701,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.7903552055358887,
"rewards/margins": 1.3101625442504883,
"rewards/rejected": -5.100517749786377,
"step": 91
},
{
"epoch": 0.3932674325407427,
"grad_norm": 11.581331107950847,
"learning_rate": 7.607684977679283e-07,
"logits/chosen": -0.6335964202880859,
"logits/rejected": -0.6610329747200012,
"logps/chosen": -519.31103515625,
"logps/rejected": -657.5548706054688,
"loss": 0.4294,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.4710004329681396,
"rewards/margins": 1.4494860172271729,
"rewards/rejected": -4.920486927032471,
"step": 92
},
{
"epoch": 0.3975420785466204,
"grad_norm": 10.631351767452296,
"learning_rate": 7.543266077808892e-07,
"logits/chosen": -0.427675724029541,
"logits/rejected": -0.45514771342277527,
"logps/chosen": -571.4942626953125,
"logps/rejected": -717.7467041015625,
"loss": 0.4636,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.948040723800659,
"rewards/margins": 1.4244039058685303,
"rewards/rejected": -5.372445106506348,
"step": 93
},
{
"epoch": 0.401816724552498,
"grad_norm": 12.057725080831089,
"learning_rate": 7.478272544632202e-07,
"logits/chosen": -0.5969647765159607,
"logits/rejected": -0.6751678586006165,
"logps/chosen": -643.1666870117188,
"logps/rejected": -773.894287109375,
"loss": 0.4507,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.552432537078857,
"rewards/margins": 1.3130940198898315,
"rewards/rejected": -5.8655266761779785,
"step": 94
},
{
"epoch": 0.40609137055837563,
"grad_norm": 10.936442473029212,
"learning_rate": 7.412719062986631e-07,
"logits/chosen": -0.4887186288833618,
"logits/rejected": -0.4894056022167206,
"logps/chosen": -555.25341796875,
"logps/rejected": -653.7760009765625,
"loss": 0.4518,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.9043784141540527,
"rewards/margins": 1.035258412361145,
"rewards/rejected": -4.939637184143066,
"step": 95
},
{
"epoch": 0.41036601656425326,
"grad_norm": 18.608338586356762,
"learning_rate": 7.346620444226059e-07,
"logits/chosen": -0.5932431221008301,
"logits/rejected": -0.6164640784263611,
"logps/chosen": -586.3929443359375,
"logps/rejected": -703.4130249023438,
"loss": 0.4449,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.206478118896484,
"rewards/margins": 1.083022117614746,
"rewards/rejected": -5.2895002365112305,
"step": 96
},
{
"epoch": 0.41464066257013094,
"grad_norm": 12.657787002066415,
"learning_rate": 7.279991622874318e-07,
"logits/chosen": -0.5697692632675171,
"logits/rejected": -0.6259853839874268,
"logps/chosen": -585.4468994140625,
"logps/rejected": -732.0520629882812,
"loss": 0.483,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.9570021629333496,
"rewards/margins": 1.447950839996338,
"rewards/rejected": -5.404953479766846,
"step": 97
},
{
"epoch": 0.41891530857600856,
"grad_norm": 13.367477990049776,
"learning_rate": 7.212847653250828e-07,
"logits/chosen": -0.6540141105651855,
"logits/rejected": -0.6565195322036743,
"logps/chosen": -731.0140380859375,
"logps/rejected": -848.3659057617188,
"loss": 0.4767,
"rewards/accuracies": 0.84375,
"rewards/chosen": -5.200318336486816,
"rewards/margins": 1.2101118564605713,
"rewards/rejected": -6.410430908203125,
"step": 98
},
{
"epoch": 0.4231899545818862,
"grad_norm": 10.139524171463488,
"learning_rate": 7.145203706069182e-07,
"logits/chosen": -0.7252554893493652,
"logits/rejected": -0.7746644616127014,
"logps/chosen": -669.2903442382812,
"logps/rejected": -828.5930786132812,
"loss": 0.3875,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.771048545837402,
"rewards/margins": 1.5108307600021362,
"rewards/rejected": -6.281879425048828,
"step": 99
},
{
"epoch": 0.4274646005877638,
"grad_norm": 12.356398801857143,
"learning_rate": 7.077075065009433e-07,
"logits/chosen": -0.557784914970398,
"logits/rejected": -0.5647093057632446,
"logps/chosen": -607.4837646484375,
"logps/rejected": -718.2990112304688,
"loss": 0.4439,
"rewards/accuracies": 0.71875,
"rewards/chosen": -4.555881500244141,
"rewards/margins": 1.080979585647583,
"rewards/rejected": -5.636861801147461,
"step": 100
},
{
"epoch": 0.4274646005877638,
"eval_logits/chosen": -0.5001155734062195,
"eval_logits/rejected": -0.5150425434112549,
"eval_logps/chosen": -704.6570434570312,
"eval_logps/rejected": -829.715087890625,
"eval_loss": 0.4167614281177521,
"eval_rewards/accuracies": 0.8145161271095276,
"eval_rewards/chosen": -4.996352672576904,
"eval_rewards/margins": 1.3122578859329224,
"eval_rewards/rejected": -6.308610916137695,
"eval_runtime": 165.659,
"eval_samples_per_second": 11.838,
"eval_steps_per_second": 0.374,
"step": 100
},
{
"epoch": 0.4317392465936415,
"grad_norm": 13.314006294995002,
"learning_rate": 7.008477123264847e-07,
"logits/chosen": -0.6598826050758362,
"logits/rejected": -0.6927035450935364,
"logps/chosen": -734.70556640625,
"logps/rejected": -900.852294921875,
"loss": 0.4118,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.161342620849609,
"rewards/margins": 1.7760246992111206,
"rewards/rejected": -6.9373674392700195,
"step": 101
},
{
"epoch": 0.4360138925995191,
"grad_norm": 13.821741873689282,
"learning_rate": 6.939425380063923e-07,
"logits/chosen": -0.6629341244697571,
"logits/rejected": -0.7558687925338745,
"logps/chosen": -699.57177734375,
"logps/rejected": -886.5426025390625,
"loss": 0.3874,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.201181888580322,
"rewards/margins": 1.6117980480194092,
"rewards/rejected": -6.8129801750183105,
"step": 102
},
{
"epoch": 0.44028853860539674,
"grad_norm": 14.52198562884218,
"learning_rate": 6.869935437168449e-07,
"logits/chosen": -0.4441612958908081,
"logits/rejected": -0.4517134428024292,
"logps/chosen": -648.8721313476562,
"logps/rejected": -743.1588745117188,
"loss": 0.4932,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.671912670135498,
"rewards/margins": 1.0545084476470947,
"rewards/rejected": -5.726420879364014,
"step": 103
},
{
"epoch": 0.44456318461127436,
"grad_norm": 15.339894722169653,
"learning_rate": 6.80002299534838e-07,
"logits/chosen": -0.719368577003479,
"logits/rejected": -0.7461254596710205,
"logps/chosen": -573.4705810546875,
"logps/rejected": -651.0980224609375,
"loss": 0.4402,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.8566808700561523,
"rewards/margins": 0.8586393594741821,
"rewards/rejected": -4.715320587158203,
"step": 104
},
{
"epoch": 0.448837830617152,
"grad_norm": 10.956142784913482,
"learning_rate": 6.72970385083438e-07,
"logits/chosen": -0.641654372215271,
"logits/rejected": -0.6621043682098389,
"logps/chosen": -592.4070434570312,
"logps/rejected": -721.1480102539062,
"loss": 0.4013,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.002495288848877,
"rewards/margins": 1.1794517040252686,
"rewards/rejected": -5.181946754455566,
"step": 105
},
{
"epoch": 0.45311247662302967,
"grad_norm": 14.08687818754259,
"learning_rate": 6.658993891748759e-07,
"logits/chosen": -0.6141338348388672,
"logits/rejected": -0.5712395310401917,
"logps/chosen": -525.6826171875,
"logps/rejected": -657.1926879882812,
"loss": 0.3788,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.3286538124084473,
"rewards/margins": 1.5682119131088257,
"rewards/rejected": -4.896864891052246,
"step": 106
},
{
"epoch": 0.4573871226289073,
"grad_norm": 12.007137757034995,
"learning_rate": 6.587909094515663e-07,
"logits/chosen": -0.6399226188659668,
"logits/rejected": -0.6818464994430542,
"logps/chosen": -515.7030639648438,
"logps/rejected": -624.790283203125,
"loss": 0.4432,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.5857155323028564,
"rewards/margins": 0.9131308794021606,
"rewards/rejected": -4.498846530914307,
"step": 107
},
{
"epoch": 0.4616617686347849,
"grad_norm": 11.626806758384587,
"learning_rate": 6.516465520251313e-07,
"logits/chosen": -0.6572325229644775,
"logits/rejected": -0.7261943221092224,
"logps/chosen": -557.6213989257812,
"logps/rejected": -685.3796997070312,
"loss": 0.4302,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.802943468093872,
"rewards/margins": 1.3063392639160156,
"rewards/rejected": -5.109282970428467,
"step": 108
},
{
"epoch": 0.46593641464066254,
"grad_norm": 11.769626267692969,
"learning_rate": 6.444679311135112e-07,
"logits/chosen": -0.6812455058097839,
"logits/rejected": -0.6769453287124634,
"logps/chosen": -545.5555419921875,
"logps/rejected": -670.9700317382812,
"loss": 0.4633,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.524083137512207,
"rewards/margins": 1.1972600221633911,
"rewards/rejected": -4.721343040466309,
"step": 109
},
{
"epoch": 0.4702110606465402,
"grad_norm": 11.834467781345984,
"learning_rate": 6.372566686762426e-07,
"logits/chosen": -0.6734607219696045,
"logits/rejected": -0.6938244104385376,
"logps/chosen": -631.7657470703125,
"logps/rejected": -778.4968872070312,
"loss": 0.3988,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.416792869567871,
"rewards/margins": 1.5345261096954346,
"rewards/rejected": -5.951319217681885,
"step": 110
},
{
"epoch": 0.47448570665241785,
"grad_norm": 10.600986700850507,
"learning_rate": 6.30014394047988e-07,
"logits/chosen": -0.7839672565460205,
"logits/rejected": -0.7656916379928589,
"logps/chosen": -520.810791015625,
"logps/rejected": -590.8253173828125,
"loss": 0.4064,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.642113447189331,
"rewards/margins": 0.7910320162773132,
"rewards/rejected": -4.433145999908447,
"step": 111
},
{
"epoch": 0.4787603526582955,
"grad_norm": 13.974810391572062,
"learning_rate": 6.227427435703995e-07,
"logits/chosen": -0.6362528204917908,
"logits/rejected": -0.7391636371612549,
"logps/chosen": -589.657470703125,
"logps/rejected": -778.1963500976562,
"loss": 0.397,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.09708309173584,
"rewards/margins": 1.6432350873947144,
"rewards/rejected": -5.7403178215026855,
"step": 112
},
{
"epoch": 0.4830349986641731,
"grad_norm": 12.06902931160219,
"learning_rate": 6.154433602223978e-07,
"logits/chosen": -0.7784813046455383,
"logits/rejected": -0.8440088033676147,
"logps/chosen": -634.3173828125,
"logps/rejected": -829.8695068359375,
"loss": 0.4383,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.257462024688721,
"rewards/margins": 1.7875339984893799,
"rewards/rejected": -6.0449957847595215,
"step": 113
},
{
"epoch": 0.4873096446700508,
"grad_norm": 13.358425807533337,
"learning_rate": 6.081178932489535e-07,
"logits/chosen": -0.7081687450408936,
"logits/rejected": -0.7073873281478882,
"logps/chosen": -569.8103637695312,
"logps/rejected": -694.3984375,
"loss": 0.4252,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.89652943611145,
"rewards/margins": 1.3301247358322144,
"rewards/rejected": -5.226654529571533,
"step": 114
},
{
"epoch": 0.4915842906759284,
"grad_norm": 11.586882789419233,
"learning_rate": 6.00767997788451e-07,
"logits/chosen": -0.5270929336547852,
"logits/rejected": -0.5626642107963562,
"logps/chosen": -693.6819458007812,
"logps/rejected": -889.004150390625,
"loss": 0.3575,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.817679405212402,
"rewards/margins": 1.964691162109375,
"rewards/rejected": -6.782370090484619,
"step": 115
},
{
"epoch": 0.49585893668180603,
"grad_norm": 12.667062697493666,
"learning_rate": 5.933953344987214e-07,
"logits/chosen": -0.6200395226478577,
"logits/rejected": -0.6530672311782837,
"logps/chosen": -617.082763671875,
"logps/rejected": -743.7801513671875,
"loss": 0.394,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.380313873291016,
"rewards/margins": 1.2766342163085938,
"rewards/rejected": -5.656947612762451,
"step": 116
},
{
"epoch": 0.5001335826876837,
"grad_norm": 13.06084918611884,
"learning_rate": 5.860015691818292e-07,
"logits/chosen": -0.5794460773468018,
"logits/rejected": -0.6392884850502014,
"logps/chosen": -523.0586547851562,
"logps/rejected": -706.2977294921875,
"loss": 0.3972,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.7082810401916504,
"rewards/margins": 1.7129367589950562,
"rewards/rejected": -5.421217918395996,
"step": 117
},
{
"epoch": 0.5044082286935613,
"grad_norm": 14.158925761660381,
"learning_rate": 5.78588372407695e-07,
"logits/chosen": -0.591346025466919,
"logits/rejected": -0.5808792114257812,
"logps/chosen": -661.6780395507812,
"logps/rejected": -753.6257934570312,
"loss": 0.3814,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.592419147491455,
"rewards/margins": 1.0543147325515747,
"rewards/rejected": -5.646734237670898,
"step": 118
},
{
"epoch": 0.508682874699439,
"grad_norm": 13.571207607427793,
"learning_rate": 5.711574191366427e-07,
"logits/chosen": -0.4889651834964752,
"logits/rejected": -0.44250980019569397,
"logps/chosen": -608.6267700195312,
"logps/rejected": -910.5017700195312,
"loss": 0.4381,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.152202606201172,
"rewards/margins": 0.7526392936706543,
"rewards/rejected": -4.904841423034668,
"step": 119
},
{
"epoch": 0.5129575207053166,
"grad_norm": 10.741928061872432,
"learning_rate": 5.637103883409525e-07,
"logits/chosen": -0.5629594922065735,
"logits/rejected": -0.6181632876396179,
"logps/chosen": -604.459228515625,
"logps/rejected": -852.540283203125,
"loss": 0.3589,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.461546421051025,
"rewards/margins": 2.3289871215820312,
"rewards/rejected": -6.790533542633057,
"step": 120
},
{
"epoch": 0.5172321667111942,
"grad_norm": 13.589230759795878,
"learning_rate": 5.562489626255103e-07,
"logits/chosen": -0.6361875534057617,
"logits/rejected": -0.6799750924110413,
"logps/chosen": -612.7998657226562,
"logps/rejected": -803.4393920898438,
"loss": 0.3612,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.2646894454956055,
"rewards/margins": 1.7221603393554688,
"rewards/rejected": -5.986849784851074,
"step": 121
},
{
"epoch": 0.5215068127170719,
"grad_norm": 13.130136917819023,
"learning_rate": 5.48774827847634e-07,
"logits/chosen": -0.6019195914268494,
"logits/rejected": -0.6733092665672302,
"logps/chosen": -578.5673828125,
"logps/rejected": -739.2437133789062,
"loss": 0.3972,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.263480186462402,
"rewards/margins": 1.4445068836212158,
"rewards/rejected": -5.707987308502197,
"step": 122
},
{
"epoch": 0.5257814587229495,
"grad_norm": 12.568726768016017,
"learning_rate": 5.412896727361662e-07,
"logits/chosen": -0.5387797951698303,
"logits/rejected": -0.6281207799911499,
"logps/chosen": -604.70703125,
"logps/rejected": -767.2006225585938,
"loss": 0.3866,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.197920322418213,
"rewards/margins": 1.5229953527450562,
"rewards/rejected": -5.720915794372559,
"step": 123
},
{
"epoch": 0.5300561047288271,
"grad_norm": 12.66743855092958,
"learning_rate": 5.337951885099166e-07,
"logits/chosen": -0.7120057940483093,
"logits/rejected": -0.6868148446083069,
"logps/chosen": -564.8189086914062,
"logps/rejected": -678.998779296875,
"loss": 0.4235,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.940401077270508,
"rewards/margins": 1.179326057434082,
"rewards/rejected": -5.11972713470459,
"step": 124
},
{
"epoch": 0.5343307507347048,
"grad_norm": 14.46489476063489,
"learning_rate": 5.262930684955438e-07,
"logits/chosen": -0.7230139970779419,
"logits/rejected": -0.7383438348770142,
"logps/chosen": -680.5816040039062,
"logps/rejected": -828.6448974609375,
"loss": 0.4321,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.831777572631836,
"rewards/margins": 1.4383575916290283,
"rewards/rejected": -6.270134925842285,
"step": 125
},
{
"epoch": 0.5386053967405824,
"grad_norm": 13.241409444585061,
"learning_rate": 5.187850077449603e-07,
"logits/chosen": -0.49940329790115356,
"logits/rejected": -0.5305464267730713,
"logps/chosen": -678.576416015625,
"logps/rejected": -828.8834838867188,
"loss": 0.3599,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.9777750968933105,
"rewards/margins": 1.5080969333648682,
"rewards/rejected": -6.485872268676758,
"step": 126
},
{
"epoch": 0.5428800427464601,
"grad_norm": 15.600684361137235,
"learning_rate": 5.11272702652346e-07,
"logits/chosen": -0.766007125377655,
"logits/rejected": -0.8170765042304993,
"logps/chosen": -783.025146484375,
"logps/rejected": -941.555908203125,
"loss": 0.3807,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.462682247161865,
"rewards/margins": 1.6811782121658325,
"rewards/rejected": -7.143860816955566,
"step": 127
},
{
"epoch": 0.5471546887523377,
"grad_norm": 12.46259382895567,
"learning_rate": 5.03757850570861e-07,
"logits/chosen": -0.6791242361068726,
"logits/rejected": -0.6803139448165894,
"logps/chosen": -693.43115234375,
"logps/rejected": -792.7020263671875,
"loss": 0.3952,
"rewards/accuracies": 0.71875,
"rewards/chosen": -4.814350128173828,
"rewards/margins": 1.0507954359054565,
"rewards/rejected": -5.865145206451416,
"step": 128
},
{
"epoch": 0.5514293347582153,
"grad_norm": 14.68281824566638,
"learning_rate": 4.962421494291391e-07,
"logits/chosen": -0.6624226570129395,
"logits/rejected": -0.8003214597702026,
"logps/chosen": -641.5405883789062,
"logps/rejected": -848.9283447265625,
"loss": 0.3979,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.510129928588867,
"rewards/margins": 1.8167277574539185,
"rewards/rejected": -6.326857566833496,
"step": 129
},
{
"epoch": 0.555703980764093,
"grad_norm": 14.156910665906969,
"learning_rate": 4.88727297347654e-07,
"logits/chosen": -0.6747015714645386,
"logits/rejected": -0.6304070353507996,
"logps/chosen": -673.6571655273438,
"logps/rejected": -818.0390625,
"loss": 0.3615,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.6810078620910645,
"rewards/margins": 1.7004587650299072,
"rewards/rejected": -6.381466388702393,
"step": 130
},
{
"epoch": 0.5599786267699706,
"grad_norm": 13.513462308218608,
"learning_rate": 4.812149922550397e-07,
"logits/chosen": -0.5138005614280701,
"logits/rejected": -0.5008392333984375,
"logps/chosen": -603.177490234375,
"logps/rejected": -718.6824951171875,
"loss": 0.4195,
"rewards/accuracies": 0.71875,
"rewards/chosen": -4.324260234832764,
"rewards/margins": 1.1721910238265991,
"rewards/rejected": -5.496450901031494,
"step": 131
},
{
"epoch": 0.5642532727758482,
"grad_norm": 13.859777475577925,
"learning_rate": 4.7370693150445615e-07,
"logits/chosen": -0.7230309247970581,
"logits/rejected": -0.7601820826530457,
"logps/chosen": -678.0418090820312,
"logps/rejected": -836.7296142578125,
"loss": 0.4123,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.642270565032959,
"rewards/margins": 1.615240216255188,
"rewards/rejected": -6.257511138916016,
"step": 132
},
{
"epoch": 0.5685279187817259,
"grad_norm": 12.304755311149645,
"learning_rate": 4.6620481149008364e-07,
"logits/chosen": -0.5858466029167175,
"logits/rejected": -0.5665376782417297,
"logps/chosen": -551.0853271484375,
"logps/rejected": -661.145263671875,
"loss": 0.3952,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.0769124031066895,
"rewards/margins": 1.150291919708252,
"rewards/rejected": -5.227204322814941,
"step": 133
},
{
"epoch": 0.5728025647876035,
"grad_norm": 13.318626812706627,
"learning_rate": 4.5871032726383385e-07,
"logits/chosen": -0.6011719703674316,
"logits/rejected": -0.6539227962493896,
"logps/chosen": -613.9602661132812,
"logps/rejected": -801.2767333984375,
"loss": 0.3028,
"rewards/accuracies": 0.9375,
"rewards/chosen": -4.4908127784729,
"rewards/margins": 1.866058588027954,
"rewards/rejected": -6.356871604919434,
"step": 134
},
{
"epoch": 0.5770772107934812,
"grad_norm": 15.02191791227977,
"learning_rate": 4.512251721523659e-07,
"logits/chosen": -0.5807833671569824,
"logits/rejected": -0.5801360607147217,
"logps/chosen": -585.0089111328125,
"logps/rejected": -690.991455078125,
"loss": 0.4568,
"rewards/accuracies": 0.65625,
"rewards/chosen": -4.436345100402832,
"rewards/margins": 0.9718096852302551,
"rewards/rejected": -5.40815544128418,
"step": 135
},
{
"epoch": 0.5813518567993589,
"grad_norm": 13.719743061532137,
"learning_rate": 4.4375103737448967e-07,
"logits/chosen": -0.6176421642303467,
"logits/rejected": -0.5977914333343506,
"logps/chosen": -647.6025390625,
"logps/rejected": -780.4196166992188,
"loss": 0.3489,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.731601715087891,
"rewards/margins": 1.3735716342926025,
"rewards/rejected": -6.105173110961914,
"step": 136
},
{
"epoch": 0.5856265028052364,
"grad_norm": 13.899011782478087,
"learning_rate": 4.362896116590475e-07,
"logits/chosen": -0.6031906604766846,
"logits/rejected": -0.6842055320739746,
"logps/chosen": -619.6199340820312,
"logps/rejected": -823.1647338867188,
"loss": 0.3829,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.415548324584961,
"rewards/margins": 1.8599358797073364,
"rewards/rejected": -6.275484085083008,
"step": 137
},
{
"epoch": 0.5899011488111141,
"grad_norm": 14.094976818965387,
"learning_rate": 4.2884258086335745e-07,
"logits/chosen": -0.5712490081787109,
"logits/rejected": -0.6076186299324036,
"logps/chosen": -635.4631958007812,
"logps/rejected": -767.619873046875,
"loss": 0.3952,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.872575759887695,
"rewards/margins": 1.2816861867904663,
"rewards/rejected": -6.154261589050293,
"step": 138
},
{
"epoch": 0.5941757948169917,
"grad_norm": 15.923719497368527,
"learning_rate": 4.2141162759230503e-07,
"logits/chosen": -0.4579673409461975,
"logits/rejected": -0.5088114738464355,
"logps/chosen": -541.15185546875,
"logps/rejected": -640.8451538085938,
"loss": 0.3694,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.0685858726501465,
"rewards/margins": 0.9016439914703369,
"rewards/rejected": -4.970229625701904,
"step": 139
},
{
"epoch": 0.5984504408228694,
"grad_norm": 15.626619862394913,
"learning_rate": 4.139984308181708e-07,
"logits/chosen": -0.6617997884750366,
"logits/rejected": -0.6638819575309753,
"logps/chosen": -747.4265747070312,
"logps/rejected": -863.458251953125,
"loss": 0.3971,
"rewards/accuracies": 0.78125,
"rewards/chosen": -5.498664379119873,
"rewards/margins": 1.1953853368759155,
"rewards/rejected": -6.694049835205078,
"step": 140
},
{
"epoch": 0.602725086828747,
"grad_norm": 16.11770967148462,
"learning_rate": 4.0660466550127853e-07,
"logits/chosen": -0.7728097438812256,
"logits/rejected": -0.8388174772262573,
"logps/chosen": -708.0802001953125,
"logps/rejected": -857.48486328125,
"loss": 0.4029,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.071959972381592,
"rewards/margins": 1.4134087562561035,
"rewards/rejected": -6.485368251800537,
"step": 141
},
{
"epoch": 0.6069997328346246,
"grad_norm": 13.358737574089647,
"learning_rate": 3.9923200221154914e-07,
"logits/chosen": -0.5902035236358643,
"logits/rejected": -0.600926399230957,
"logps/chosen": -655.1759033203125,
"logps/rejected": -776.4323120117188,
"loss": 0.4008,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.78019905090332,
"rewards/margins": 1.229543685913086,
"rewards/rejected": -6.009742736816406,
"step": 142
},
{
"epoch": 0.6112743788405023,
"grad_norm": 20.882100986314004,
"learning_rate": 3.918821067510464e-07,
"logits/chosen": -0.5608689188957214,
"logits/rejected": -0.5520298480987549,
"logps/chosen": -606.8939208984375,
"logps/rejected": -732.2677001953125,
"loss": 0.481,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.506969928741455,
"rewards/margins": 1.263184666633606,
"rewards/rejected": -5.77015495300293,
"step": 143
},
{
"epoch": 0.6155490248463799,
"grad_norm": 15.142641307206523,
"learning_rate": 3.845566397776021e-07,
"logits/chosen": -0.5451078414916992,
"logits/rejected": -0.5328483581542969,
"logps/chosen": -578.9205932617188,
"logps/rejected": -720.2644653320312,
"loss": 0.3981,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.13190221786499,
"rewards/margins": 1.3544728755950928,
"rewards/rejected": -5.486375331878662,
"step": 144
},
{
"epoch": 0.6198236708522575,
"grad_norm": 14.102739323418549,
"learning_rate": 3.772572564296004e-07,
"logits/chosen": -0.5815902948379517,
"logits/rejected": -0.6612125635147095,
"logps/chosen": -647.4329833984375,
"logps/rejected": -800.5177001953125,
"loss": 0.3771,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.570836067199707,
"rewards/margins": 1.5209659337997437,
"rewards/rejected": -6.091801643371582,
"step": 145
},
{
"epoch": 0.6240983168581352,
"grad_norm": 16.783278936540377,
"learning_rate": 3.699856059520118e-07,
"logits/chosen": -0.5741180777549744,
"logits/rejected": -0.6261047720909119,
"logps/chosen": -518.5027465820312,
"logps/rejected": -752.7945556640625,
"loss": 0.339,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.5159010887145996,
"rewards/margins": 2.189849615097046,
"rewards/rejected": -5.705749988555908,
"step": 146
},
{
"epoch": 0.6283729628640128,
"grad_norm": 16.241290219398206,
"learning_rate": 3.627433313237576e-07,
"logits/chosen": -0.6445101499557495,
"logits/rejected": -0.6295093297958374,
"logps/chosen": -611.40576171875,
"logps/rejected": -746.1281127929688,
"loss": 0.4584,
"rewards/accuracies": 0.71875,
"rewards/chosen": -4.344273090362549,
"rewards/margins": 1.294838309288025,
"rewards/rejected": -5.639111518859863,
"step": 147
},
{
"epoch": 0.6326476088698905,
"grad_norm": 16.58217058287404,
"learning_rate": 3.5553206888648885e-07,
"logits/chosen": -0.5924898386001587,
"logits/rejected": -0.6705700755119324,
"logps/chosen": -561.9385986328125,
"logps/rejected": -790.1613159179688,
"loss": 0.3589,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.8234567642211914,
"rewards/margins": 2.038635015487671,
"rewards/rejected": -5.862092018127441,
"step": 148
},
{
"epoch": 0.6369222548757681,
"grad_norm": 13.631809000580427,
"learning_rate": 3.483534479748688e-07,
"logits/chosen": -0.6043068170547485,
"logits/rejected": -0.6237097978591919,
"logps/chosen": -599.3536376953125,
"logps/rejected": -737.3873291015625,
"loss": 0.3333,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.288971424102783,
"rewards/margins": 1.361509919166565,
"rewards/rejected": -5.650481224060059,
"step": 149
},
{
"epoch": 0.6411969008816457,
"grad_norm": 13.48441747872976,
"learning_rate": 3.412090905484337e-07,
"logits/chosen": -0.5789849758148193,
"logits/rejected": -0.5934211015701294,
"logps/chosen": -637.4574584960938,
"logps/rejected": -799.4398193359375,
"loss": 0.3633,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.6277079582214355,
"rewards/margins": 1.6034901142120361,
"rewards/rejected": -6.231198310852051,
"step": 150
},
{
"epoch": 0.6454715468875234,
"grad_norm": 13.54705279065067,
"learning_rate": 3.3410061082512417e-07,
"logits/chosen": -0.7143419981002808,
"logits/rejected": -0.7396361231803894,
"logps/chosen": -644.5235595703125,
"logps/rejected": -819.0350341796875,
"loss": 0.3787,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.598511695861816,
"rewards/margins": 1.7460615634918213,
"rewards/rejected": -6.344573020935059,
"step": 151
},
{
"epoch": 0.649746192893401,
"grad_norm": 13.97282414379501,
"learning_rate": 3.270296149165619e-07,
"logits/chosen": -0.7898523807525635,
"logits/rejected": -0.78404700756073,
"logps/chosen": -746.5989990234375,
"logps/rejected": -925.236328125,
"loss": 0.3681,
"rewards/accuracies": 0.6875,
"rewards/chosen": -5.577144145965576,
"rewards/margins": 1.7709450721740723,
"rewards/rejected": -7.348089218139648,
"step": 152
},
{
"epoch": 0.6540208388992786,
"grad_norm": 15.30647255299837,
"learning_rate": 3.1999770046516194e-07,
"logits/chosen": -0.6549022197723389,
"logits/rejected": -0.6652963161468506,
"logps/chosen": -734.189453125,
"logps/rejected": -882.783203125,
"loss": 0.397,
"rewards/accuracies": 0.84375,
"rewards/chosen": -5.586746692657471,
"rewards/margins": 1.5474714040756226,
"rewards/rejected": -7.134217739105225,
"step": 153
},
{
"epoch": 0.6582954849051563,
"grad_norm": 15.480021561153041,
"learning_rate": 3.1300645628315526e-07,
"logits/chosen": -0.6595125794410706,
"logits/rejected": -0.684340238571167,
"logps/chosen": -692.1060180664062,
"logps/rejected": -861.9107055664062,
"loss": 0.3519,
"rewards/accuracies": 0.90625,
"rewards/chosen": -5.179555416107178,
"rewards/margins": 1.7508639097213745,
"rewards/rejected": -6.930419445037842,
"step": 154
},
{
"epoch": 0.6625701309110339,
"grad_norm": 14.370880370113698,
"learning_rate": 3.060574619936075e-07,
"logits/chosen": -0.6609420776367188,
"logits/rejected": -0.6882165670394897,
"logps/chosen": -755.37109375,
"logps/rejected": -923.8385620117188,
"loss": 0.3964,
"rewards/accuracies": 0.84375,
"rewards/chosen": -5.7476301193237305,
"rewards/margins": 1.664482831954956,
"rewards/rejected": -7.412113189697266,
"step": 155
},
{
"epoch": 0.6668447769169116,
"grad_norm": 15.740330800217551,
"learning_rate": 2.9915228767351535e-07,
"logits/chosen": -0.6638086438179016,
"logits/rejected": -0.6543954014778137,
"logps/chosen": -691.0468139648438,
"logps/rejected": -834.0437622070312,
"loss": 0.3507,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.920919895172119,
"rewards/margins": 1.5358891487121582,
"rewards/rejected": -6.4568095207214355,
"step": 156
},
{
"epoch": 0.6711194229227893,
"grad_norm": 16.901891917408413,
"learning_rate": 2.922924934990568e-07,
"logits/chosen": -0.7027104496955872,
"logits/rejected": -0.7690137624740601,
"logps/chosen": -740.62353515625,
"logps/rejected": -911.9879150390625,
"loss": 0.383,
"rewards/accuracies": 0.84375,
"rewards/chosen": -5.2726335525512695,
"rewards/margins": 1.703195571899414,
"rewards/rejected": -6.975828647613525,
"step": 157
},
{
"epoch": 0.6753940689286668,
"grad_norm": 14.502221464305004,
"learning_rate": 2.8547962939308186e-07,
"logits/chosen": -0.7240225672721863,
"logits/rejected": -0.743172287940979,
"logps/chosen": -638.63720703125,
"logps/rejected": -785.5480346679688,
"loss": 0.3907,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.341987133026123,
"rewards/margins": 1.3872699737548828,
"rewards/rejected": -5.729257583618164,
"step": 158
},
{
"epoch": 0.6796687149345445,
"grad_norm": 13.076811073919702,
"learning_rate": 2.7871523467491725e-07,
"logits/chosen": -0.5847674608230591,
"logits/rejected": -0.6150667667388916,
"logps/chosen": -558.647216796875,
"logps/rejected": -742.8478393554688,
"loss": 0.3777,
"rewards/accuracies": 0.9375,
"rewards/chosen": -3.8698740005493164,
"rewards/margins": 1.8015196323394775,
"rewards/rejected": -5.671393394470215,
"step": 159
},
{
"epoch": 0.6839433609404221,
"grad_norm": 13.435743880369799,
"learning_rate": 2.720008377125682e-07,
"logits/chosen": -0.7234424352645874,
"logits/rejected": -0.7753596305847168,
"logps/chosen": -599.20947265625,
"logps/rejected": -840.508544921875,
"loss": 0.3559,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.066052436828613,
"rewards/margins": 2.2670648097991943,
"rewards/rejected": -6.3331170082092285,
"step": 160
},
{
"epoch": 0.6882180069462998,
"grad_norm": 14.725846441531868,
"learning_rate": 2.6533795557739405e-07,
"logits/chosen": -0.5986216068267822,
"logits/rejected": -0.6119877099990845,
"logps/chosen": -587.1053466796875,
"logps/rejected": -751.2557983398438,
"loss": 0.3576,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.9571621417999268,
"rewards/margins": 1.737083077430725,
"rewards/rejected": -5.694245338439941,
"step": 161
},
{
"epoch": 0.6924926529521774,
"grad_norm": 18.854853550251143,
"learning_rate": 2.5872809370133704e-07,
"logits/chosen": -0.7047430872917175,
"logits/rejected": -0.7392921447753906,
"logps/chosen": -541.9876098632812,
"logps/rejected": -679.9630737304688,
"loss": 0.3713,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.7241249084472656,
"rewards/margins": 1.3988580703735352,
"rewards/rejected": -5.122982978820801,
"step": 162
},
{
"epoch": 0.696767298958055,
"grad_norm": 15.327216222305758,
"learning_rate": 2.521727455367797e-07,
"logits/chosen": -0.4683057963848114,
"logits/rejected": -0.4958358705043793,
"logps/chosen": -477.89453125,
"logps/rejected": -642.5132446289062,
"loss": 0.3217,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.4097371101379395,
"rewards/margins": 1.6171811819076538,
"rewards/rejected": -5.026918411254883,
"step": 163
},
{
"epoch": 0.7010419449639327,
"grad_norm": 15.107047920787036,
"learning_rate": 2.456733922191108e-07,
"logits/chosen": -0.6403992176055908,
"logits/rejected": -0.7081367373466492,
"logps/chosen": -535.940673828125,
"logps/rejected": -721.487060546875,
"loss": 0.3835,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.6437511444091797,
"rewards/margins": 1.7465698719024658,
"rewards/rejected": -5.390320777893066,
"step": 164
},
{
"epoch": 0.7053165909698104,
"grad_norm": 14.789037132111227,
"learning_rate": 2.3923150223207173e-07,
"logits/chosen": -0.6419979333877563,
"logits/rejected": -0.6663538217544556,
"logps/chosen": -611.316162109375,
"logps/rejected": -779.5244140625,
"loss": 0.357,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.9267449378967285,
"rewards/margins": 1.7568156719207764,
"rewards/rejected": -5.683561325073242,
"step": 165
},
{
"epoch": 0.7095912369756879,
"grad_norm": 11.678139693168967,
"learning_rate": 2.3284853107596347e-07,
"logits/chosen": -0.643075704574585,
"logits/rejected": -0.677239179611206,
"logps/chosen": -605.615478515625,
"logps/rejected": -788.3361206054688,
"loss": 0.3161,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.1840410232543945,
"rewards/margins": 1.828648567199707,
"rewards/rejected": -6.012689590454102,
"step": 166
},
{
"epoch": 0.7138658829815656,
"grad_norm": 14.838448866889486,
"learning_rate": 2.2652592093878665e-07,
"logits/chosen": -0.5908488631248474,
"logits/rejected": -0.6062439680099487,
"logps/chosen": -623.344482421875,
"logps/rejected": -775.658447265625,
"loss": 0.3693,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.541435241699219,
"rewards/margins": 1.4988759756088257,
"rewards/rejected": -6.040311336517334,
"step": 167
},
{
"epoch": 0.7181405289874432,
"grad_norm": 13.768660505175903,
"learning_rate": 2.202651003703885e-07,
"logits/chosen": -0.5698331594467163,
"logits/rejected": -0.5812557339668274,
"logps/chosen": -609.9556274414062,
"logps/rejected": -810.02587890625,
"loss": 0.3727,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.267837047576904,
"rewards/margins": 1.995699167251587,
"rewards/rejected": -6.2635369300842285,
"step": 168
},
{
"epoch": 0.7224151749933209,
"grad_norm": 12.956408292396839,
"learning_rate": 2.1406748395969305e-07,
"logits/chosen": -0.6224421858787537,
"logits/rejected": -0.6563930511474609,
"logps/chosen": -620.7723388671875,
"logps/rejected": -781.536865234375,
"loss": 0.3362,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.473085403442383,
"rewards/margins": 1.6361382007598877,
"rewards/rejected": -6.109223365783691,
"step": 169
},
{
"epoch": 0.7266898209991985,
"grad_norm": 15.331044331855917,
"learning_rate": 2.0793447201508286e-07,
"logits/chosen": -0.6418094635009766,
"logits/rejected": -0.6333540678024292,
"logps/chosen": -693.9434204101562,
"logps/rejected": -780.4866333007812,
"loss": 0.3273,
"rewards/accuracies": 0.6875,
"rewards/chosen": -5.1741743087768555,
"rewards/margins": 0.9175342321395874,
"rewards/rejected": -6.091708660125732,
"step": 170
},
{
"epoch": 0.7309644670050761,
"grad_norm": 14.142847544336131,
"learning_rate": 2.01867450248011e-07,
"logits/chosen": -0.6202086210250854,
"logits/rejected": -0.6713452935218811,
"logps/chosen": -733.3516235351562,
"logps/rejected": -912.0302734375,
"loss": 0.3551,
"rewards/accuracies": 0.75,
"rewards/chosen": -5.687126636505127,
"rewards/margins": 1.6365104913711548,
"rewards/rejected": -7.32363748550415,
"step": 171
},
{
"epoch": 0.7352391130109538,
"grad_norm": 14.223351065221193,
"learning_rate": 1.9586778945990783e-07,
"logits/chosen": -0.5605691075325012,
"logits/rejected": -0.6431994438171387,
"logps/chosen": -720.2319946289062,
"logps/rejected": -907.8939819335938,
"loss": 0.3518,
"rewards/accuracies": 0.84375,
"rewards/chosen": -5.346857070922852,
"rewards/margins": 1.8520584106445312,
"rewards/rejected": -7.198915481567383,
"step": 172
},
{
"epoch": 0.7395137590168315,
"grad_norm": 14.283973424406769,
"learning_rate": 1.899368452324584e-07,
"logits/chosen": -0.8039106130599976,
"logits/rejected": -0.8154680728912354,
"logps/chosen": -702.9293823242188,
"logps/rejected": -874.7135009765625,
"loss": 0.369,
"rewards/accuracies": 0.71875,
"rewards/chosen": -5.166163444519043,
"rewards/margins": 1.7623482942581177,
"rewards/rejected": -6.928511142730713,
"step": 173
},
{
"epoch": 0.743788405022709,
"grad_norm": 13.648679506818656,
"learning_rate": 1.840759576213181e-07,
"logits/chosen": -0.5466803908348083,
"logits/rejected": -0.6139577031135559,
"logps/chosen": -643.9953002929688,
"logps/rejected": -851.5556640625,
"loss": 0.3272,
"rewards/accuracies": 0.96875,
"rewards/chosen": -4.4559197425842285,
"rewards/margins": 2.084817886352539,
"rewards/rejected": -6.540737152099609,
"step": 174
},
{
"epoch": 0.7480630510285867,
"grad_norm": 16.8663479372205,
"learning_rate": 1.7828645085333644e-07,
"logits/chosen": -0.6725043654441833,
"logits/rejected": -0.7195257544517517,
"logps/chosen": -706.1845703125,
"logps/rejected": -901.8247680664062,
"loss": 0.3902,
"rewards/accuracies": 0.84375,
"rewards/chosen": -5.220101356506348,
"rewards/margins": 1.968306541442871,
"rewards/rejected": -7.188408374786377,
"step": 175
},
{
"epoch": 0.7523376970344643,
"grad_norm": 13.612220492425323,
"learning_rate": 1.725696330273575e-07,
"logits/chosen": -0.7102064490318298,
"logits/rejected": -0.7330564260482788,
"logps/chosen": -644.0183715820312,
"logps/rejected": -833.1116943359375,
"loss": 0.3007,
"rewards/accuracies": 0.9375,
"rewards/chosen": -4.664500713348389,
"rewards/margins": 1.9048974514007568,
"rewards/rejected": -6.569398403167725,
"step": 176
},
{
"epoch": 0.756612343040342,
"grad_norm": 13.592118995239437,
"learning_rate": 1.6692679581866332e-07,
"logits/chosen": -0.5269302725791931,
"logits/rejected": -0.562321662902832,
"logps/chosen": -647.377685546875,
"logps/rejected": -876.7937622070312,
"loss": 0.3336,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.665207862854004,
"rewards/margins": 2.2923545837402344,
"rewards/rejected": -6.9575629234313965,
"step": 177
},
{
"epoch": 0.7608869890462197,
"grad_norm": 13.100123198154126,
"learning_rate": 1.6135921418712955e-07,
"logits/chosen": -0.6257606744766235,
"logits/rejected": -0.6605125069618225,
"logps/chosen": -589.7310791015625,
"logps/rejected": -762.8815307617188,
"loss": 0.3326,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.342212200164795,
"rewards/margins": 1.6928372383117676,
"rewards/rejected": -6.035048961639404,
"step": 178
},
{
"epoch": 0.7651616350520972,
"grad_norm": 24.95386459216583,
"learning_rate": 1.558681460891567e-07,
"logits/chosen": -0.6882709860801697,
"logits/rejected": -0.7164211273193359,
"logps/chosen": -729.4912719726562,
"logps/rejected": -960.5554809570312,
"loss": 0.3596,
"rewards/accuracies": 0.90625,
"rewards/chosen": -5.27896785736084,
"rewards/margins": 2.3151795864105225,
"rewards/rejected": -7.594147682189941,
"step": 179
},
{
"epoch": 0.7694362810579749,
"grad_norm": 14.508183670705439,
"learning_rate": 1.5045483219344385e-07,
"logits/chosen": -0.4706317186355591,
"logits/rejected": -0.4956177771091461,
"logps/chosen": -686.981201171875,
"logps/rejected": -892.425537109375,
"loss": 0.3878,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.139954090118408,
"rewards/margins": 1.9214184284210205,
"rewards/rejected": -7.061371803283691,
"step": 180
},
{
"epoch": 0.7737109270638525,
"grad_norm": 13.601053697523751,
"learning_rate": 1.4512049560066835e-07,
"logits/chosen": -0.5556597113609314,
"logits/rejected": -0.6318129301071167,
"logps/chosen": -582.2374877929688,
"logps/rejected": -793.1754760742188,
"loss": 0.3245,
"rewards/accuracies": 0.9375,
"rewards/chosen": -4.254069805145264,
"rewards/margins": 2.0026259422302246,
"rewards/rejected": -6.256695747375488,
"step": 181
},
{
"epoch": 0.7779855730697302,
"grad_norm": 18.93784031657066,
"learning_rate": 1.3986634156713417e-07,
"logits/chosen": -0.5807328224182129,
"logits/rejected": -0.5342915654182434,
"logps/chosen": -602.649658203125,
"logps/rejected": -747.00732421875,
"loss": 0.3884,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.532341957092285,
"rewards/margins": 1.6315253973007202,
"rewards/rejected": -6.163866996765137,
"step": 182
},
{
"epoch": 0.7822602190756078,
"grad_norm": 16.431500936058505,
"learning_rate": 1.34693557232453e-07,
"logits/chosen": -0.6352940797805786,
"logits/rejected": -0.6816412210464478,
"logps/chosen": -679.6211547851562,
"logps/rejected": -891.3534545898438,
"loss": 0.3745,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.584761619567871,
"rewards/margins": 2.096449375152588,
"rewards/rejected": -6.681210994720459,
"step": 183
},
{
"epoch": 0.7865348650814854,
"grad_norm": 17.829780427983952,
"learning_rate": 1.2960331135131823e-07,
"logits/chosen": -0.6611433029174805,
"logits/rejected": -0.6609802842140198,
"logps/chosen": -662.570068359375,
"logps/rejected": -845.5758666992188,
"loss": 0.3201,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.930952072143555,
"rewards/margins": 1.9189307689666748,
"rewards/rejected": -6.849882125854492,
"step": 184
},
{
"epoch": 0.7908095110873631,
"grad_norm": 14.013649032140211,
"learning_rate": 1.2459675402943288e-07,
"logits/chosen": -0.5998414754867554,
"logits/rejected": -0.6097269058227539,
"logps/chosen": -691.530517578125,
"logps/rejected": -900.010986328125,
"loss": 0.2947,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.21755838394165,
"rewards/margins": 2.078969955444336,
"rewards/rejected": -7.296527862548828,
"step": 185
},
{
"epoch": 0.7950841570932408,
"grad_norm": 13.880919593847631,
"learning_rate": 1.1967501646365146e-07,
"logits/chosen": -0.6878648996353149,
"logits/rejected": -0.7795136570930481,
"logps/chosen": -629.9570922851562,
"logps/rejected": -834.251708984375,
"loss": 0.3317,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.382741451263428,
"rewards/margins": 2.003836154937744,
"rewards/rejected": -6.386577606201172,
"step": 186
},
{
"epoch": 0.7993588030991183,
"grad_norm": 15.353298997567329,
"learning_rate": 1.1483921068639351e-07,
"logits/chosen": -0.6340612173080444,
"logits/rejected": -0.6507092714309692,
"logps/chosen": -723.2861938476562,
"logps/rejected": -939.2916259765625,
"loss": 0.3424,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.30019474029541,
"rewards/margins": 2.066894769668579,
"rewards/rejected": -7.36708927154541,
"step": 187
},
{
"epoch": 0.803633449104996,
"grad_norm": 14.566838990350824,
"learning_rate": 1.1009042931438783e-07,
"logits/chosen": -0.6575983762741089,
"logits/rejected": -0.699353814125061,
"logps/chosen": -693.1535034179688,
"logps/rejected": -887.3889770507812,
"loss": 0.3477,
"rewards/accuracies": 0.84375,
"rewards/chosen": -5.220640182495117,
"rewards/margins": 1.8395450115203857,
"rewards/rejected": -7.060185432434082,
"step": 188
},
{
"epoch": 0.8079080951108736,
"grad_norm": 17.580820182141434,
"learning_rate": 1.0542974530180327e-07,
"logits/chosen": -0.6650811433792114,
"logits/rejected": -0.7292627692222595,
"logps/chosen": -640.0955200195312,
"logps/rejected": -821.3978881835938,
"loss": 0.3859,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.4905171394348145,
"rewards/margins": 1.7442741394042969,
"rewards/rejected": -6.234791278839111,
"step": 189
},
{
"epoch": 0.8121827411167513,
"grad_norm": 14.138278292631155,
"learning_rate": 1.0085821169782199e-07,
"logits/chosen": -0.6633419394493103,
"logits/rejected": -0.7350410223007202,
"logps/chosen": -550.8514404296875,
"logps/rejected": -755.83056640625,
"loss": 0.3497,
"rewards/accuracies": 0.96875,
"rewards/chosen": -3.9601831436157227,
"rewards/margins": 1.8831403255462646,
"rewards/rejected": -5.843323707580566,
"step": 190
},
{
"epoch": 0.8164573871226289,
"grad_norm": 15.051759686903543,
"learning_rate": 9.637686140871121e-08,
"logits/chosen": -0.5633993148803711,
"logits/rejected": -0.5642579793930054,
"logps/chosen": -751.5147705078125,
"logps/rejected": -917.2871704101562,
"loss": 0.3641,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.359073162078857,
"rewards/margins": 1.6620866060256958,
"rewards/rejected": -7.021159648895264,
"step": 191
},
{
"epoch": 0.8207320331285065,
"grad_norm": 16.652659340140023,
"learning_rate": 9.198670696444338e-08,
"logits/chosen": -0.6166589260101318,
"logits/rejected": -0.6604666709899902,
"logps/chosen": -641.2590942382812,
"logps/rejected": -832.0524291992188,
"loss": 0.3808,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.821630954742432,
"rewards/margins": 1.7895194292068481,
"rewards/rejected": -6.611149787902832,
"step": 192
},
{
"epoch": 0.8250066791343842,
"grad_norm": 14.1280180622041,
"learning_rate": 8.768874028992429e-08,
"logits/chosen": -0.6036140322685242,
"logits/rejected": -0.6334025859832764,
"logps/chosen": -613.3704223632812,
"logps/rejected": -792.6769409179688,
"loss": 0.3289,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.494506359100342,
"rewards/margins": 1.7201225757598877,
"rewards/rejected": -6.21462869644165,
"step": 193
},
{
"epoch": 0.8292813251402619,
"grad_norm": 13.810909832189019,
"learning_rate": 8.348393248087287e-08,
"logits/chosen": -0.5372692346572876,
"logits/rejected": -0.5193148255348206,
"logps/chosen": -559.5550537109375,
"logps/rejected": -737.1176147460938,
"loss": 0.3486,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.210740089416504,
"rewards/margins": 1.7630614042282104,
"rewards/rejected": -5.973801612854004,
"step": 194
},
{
"epoch": 0.8335559711461394,
"grad_norm": 13.364037119659574,
"learning_rate": 7.937323358440934e-08,
"logits/chosen": -0.7003932595252991,
"logits/rejected": -0.6555891633033752,
"logps/chosen": -687.826171875,
"logps/rejected": -845.4681396484375,
"loss": 0.3359,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.774288654327393,
"rewards/margins": 1.7430295944213867,
"rewards/rejected": -6.517318248748779,
"step": 195
},
{
"epoch": 0.8378306171520171,
"grad_norm": 14.21133162771402,
"learning_rate": 7.535757238439938e-08,
"logits/chosen": -0.7010968923568726,
"logits/rejected": -0.7519139647483826,
"logps/chosen": -618.968017578125,
"logps/rejected": -890.8580322265625,
"loss": 0.2995,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.3997673988342285,
"rewards/margins": 2.6948697566986084,
"rewards/rejected": -7.094637393951416,
"step": 196
},
{
"epoch": 0.8421052631578947,
"grad_norm": 14.29704216155569,
"learning_rate": 7.143785619160026e-08,
"logits/chosen": -0.8667165637016296,
"logits/rejected": -0.9397881031036377,
"logps/chosen": -635.5562133789062,
"logps/rejected": -869.5856323242188,
"loss": 0.2775,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.424742221832275,
"rewards/margins": 2.1135940551757812,
"rewards/rejected": -6.538336277008057,
"step": 197
},
{
"epoch": 0.8463799091637724,
"grad_norm": 15.830532425162781,
"learning_rate": 6.761497063866206e-08,
"logits/chosen": -0.715203583240509,
"logits/rejected": -0.7201322317123413,
"logps/chosen": -675.7356567382812,
"logps/rejected": -823.666748046875,
"loss": 0.3961,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.689385890960693,
"rewards/margins": 1.436130404472351,
"rewards/rejected": -6.125515937805176,
"step": 198
},
{
"epoch": 0.85065455516965,
"grad_norm": 12.71855284005859,
"learning_rate": 6.388977948002406e-08,
"logits/chosen": -0.6863987445831299,
"logits/rejected": -0.7026057839393616,
"logps/chosen": -635.3793334960938,
"logps/rejected": -810.3753051757812,
"loss": 0.3249,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.71897554397583,
"rewards/margins": 1.7087393999099731,
"rewards/rejected": -6.427714824676514,
"step": 199
},
{
"epoch": 0.8549292011755276,
"grad_norm": 14.663842598726813,
"learning_rate": 6.026312439675551e-08,
"logits/chosen": -0.6016858220100403,
"logits/rejected": -0.6423814296722412,
"logps/chosen": -533.045654296875,
"logps/rejected": -688.97705078125,
"loss": 0.343,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.7861201763153076,
"rewards/margins": 1.5125634670257568,
"rewards/rejected": -5.298683166503906,
"step": 200
},
{
"epoch": 0.8549292011755276,
"eval_logits/chosen": -0.5621978044509888,
"eval_logits/rejected": -0.5776455998420715,
"eval_logps/chosen": -698.124755859375,
"eval_logps/rejected": -878.510498046875,
"eval_loss": 0.3298446834087372,
"eval_rewards/accuracies": 0.8951612710952759,
"eval_rewards/chosen": -4.931028842926025,
"eval_rewards/margins": 1.8655366897583008,
"eval_rewards/rejected": -6.796565055847168,
"eval_runtime": 148.1942,
"eval_samples_per_second": 13.233,
"eval_steps_per_second": 0.418,
"step": 200
},
{
"epoch": 0.8592038471814053,
"grad_norm": 13.178173716498932,
"learning_rate": 5.6735824806383945e-08,
"logits/chosen": -0.8137510418891907,
"logits/rejected": -0.8618326783180237,
"logps/chosen": -762.1471557617188,
"logps/rejected": -987.831787109375,
"loss": 0.3117,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.3300957679748535,
"rewards/margins": 2.290207862854004,
"rewards/rejected": -7.620304107666016,
"step": 201
},
{
"epoch": 0.863478493187283,
"grad_norm": 15.015075721065077,
"learning_rate": 5.3308677677753324e-08,
"logits/chosen": -0.6007865071296692,
"logits/rejected": -0.6010035872459412,
"logps/chosen": -607.5897827148438,
"logps/rejected": -783.7905883789062,
"loss": 0.3751,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.329671382904053,
"rewards/margins": 1.7175862789154053,
"rewards/rejected": -6.047257423400879,
"step": 202
},
{
"epoch": 0.8677531391931605,
"grad_norm": 17.440815026489904,
"learning_rate": 4.9982457350954576e-08,
"logits/chosen": -0.6124709844589233,
"logits/rejected": -0.5998551249504089,
"logps/chosen": -768.2071533203125,
"logps/rejected": -909.81982421875,
"loss": 0.337,
"rewards/accuracies": 0.75,
"rewards/chosen": -5.643802642822266,
"rewards/margins": 1.5375871658325195,
"rewards/rejected": -7.181390285491943,
"step": 203
},
{
"epoch": 0.8720277851990382,
"grad_norm": 13.117492290397454,
"learning_rate": 4.675791536236856e-08,
"logits/chosen": -0.6630779504776001,
"logits/rejected": -0.7016023397445679,
"logps/chosen": -576.6640014648438,
"logps/rejected": -781.410888671875,
"loss": 0.3565,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.391974449157715,
"rewards/margins": 2.0928616523742676,
"rewards/rejected": -6.484836101531982,
"step": 204
},
{
"epoch": 0.8763024312049158,
"grad_norm": 16.38485979196371,
"learning_rate": 4.3635780274861864e-08,
"logits/chosen": -0.6497581005096436,
"logits/rejected": -0.7152493596076965,
"logps/chosen": -604.6185302734375,
"logps/rejected": -769.5409545898438,
"loss": 0.3998,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.433528900146484,
"rewards/margins": 1.5042006969451904,
"rewards/rejected": -5.937729835510254,
"step": 205
},
{
"epoch": 0.8805770772107935,
"grad_norm": 19.27882004478496,
"learning_rate": 4.0616757513173115e-08,
"logits/chosen": -0.5923482775688171,
"logits/rejected": -0.6674529910087585,
"logps/chosen": -730.7793579101562,
"logps/rejected": -982.1051025390625,
"loss": 0.3504,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.950905799865723,
"rewards/margins": 2.3350894451141357,
"rewards/rejected": -7.2859954833984375,
"step": 206
},
{
"epoch": 0.8848517232166712,
"grad_norm": 12.70289041030948,
"learning_rate": 3.7701529204526846e-08,
"logits/chosen": -0.5751956105232239,
"logits/rejected": -0.6102803349494934,
"logps/chosen": -638.7919311523438,
"logps/rejected": -788.8345336914062,
"loss": 0.3242,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.849457740783691,
"rewards/margins": 1.458791971206665,
"rewards/rejected": -6.308249473571777,
"step": 207
},
{
"epoch": 0.8891263692225487,
"grad_norm": 16.266039867395605,
"learning_rate": 3.4890754024512246e-08,
"logits/chosen": -0.6625305414199829,
"logits/rejected": -0.6873234510421753,
"logps/chosen": -717.0994873046875,
"logps/rejected": -884.3046875,
"loss": 0.3447,
"rewards/accuracies": 0.75,
"rewards/chosen": -5.226888179779053,
"rewards/margins": 1.590306043624878,
"rewards/rejected": -6.81719446182251,
"step": 208
},
{
"epoch": 0.8934010152284264,
"grad_norm": 14.509559337608318,
"learning_rate": 3.218506704825924e-08,
"logits/chosen": -0.5810579061508179,
"logits/rejected": -0.6108168363571167,
"logps/chosen": -675.04150390625,
"logps/rejected": -836.3502197265625,
"loss": 0.3439,
"rewards/accuracies": 0.78125,
"rewards/chosen": -5.011352062225342,
"rewards/margins": 1.6481198072433472,
"rewards/rejected": -6.6594719886779785,
"step": 209
},
{
"epoch": 0.897675661234304,
"grad_norm": 17.634651177078286,
"learning_rate": 2.958507960694784e-08,
"logits/chosen": -0.5604880452156067,
"logits/rejected": -0.5632505416870117,
"logps/chosen": -718.5660400390625,
"logps/rejected": -901.1178588867188,
"loss": 0.3646,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.241810321807861,
"rewards/margins": 1.8300410509109497,
"rewards/rejected": -7.071850776672363,
"step": 210
},
{
"epoch": 0.9019503072401817,
"grad_norm": 14.379481683528487,
"learning_rate": 2.7091379149682682e-08,
"logits/chosen": -0.7952392101287842,
"logits/rejected": -0.8281516432762146,
"logps/chosen": -715.3723754882812,
"logps/rejected": -877.9434814453125,
"loss": 0.3452,
"rewards/accuracies": 0.78125,
"rewards/chosen": -5.230836868286133,
"rewards/margins": 1.5704870223999023,
"rewards/rejected": -6.801323413848877,
"step": 211
},
{
"epoch": 0.9062249532460593,
"grad_norm": 12.38379892339324,
"learning_rate": 2.470452911076226e-08,
"logits/chosen": -0.6212865114212036,
"logits/rejected": -0.7052218317985535,
"logps/chosen": -568.6085205078125,
"logps/rejected": -806.6718139648438,
"loss": 0.3225,
"rewards/accuracies": 0.96875,
"rewards/chosen": -4.2210798263549805,
"rewards/margins": 2.2609493732452393,
"rewards/rejected": -6.482028484344482,
"step": 212
},
{
"epoch": 0.9104995992519369,
"grad_norm": 13.671352372902698,
"learning_rate": 2.2425068782375378e-08,
"logits/chosen": -0.708694577217102,
"logits/rejected": -0.7406002283096313,
"logps/chosen": -631.50537109375,
"logps/rejected": -812.782958984375,
"loss": 0.307,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.614077091217041,
"rewards/margins": 1.823103427886963,
"rewards/rejected": -6.4371795654296875,
"step": 213
},
{
"epoch": 0.9147742452578146,
"grad_norm": 14.646295067927662,
"learning_rate": 2.025351319275137e-08,
"logits/chosen": -0.6601104736328125,
"logits/rejected": -0.6835007667541504,
"logps/chosen": -590.8656005859375,
"logps/rejected": -773.0665283203125,
"loss": 0.3485,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.47109317779541,
"rewards/margins": 1.788784384727478,
"rewards/rejected": -6.259877681732178,
"step": 214
},
{
"epoch": 0.9190488912636923,
"grad_norm": 13.735401688183746,
"learning_rate": 1.8190352989793322e-08,
"logits/chosen": -0.6832427978515625,
"logits/rejected": -0.7862576246261597,
"logps/chosen": -725.4074096679688,
"logps/rejected": -984.6351318359375,
"loss": 0.3231,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.303395748138428,
"rewards/margins": 2.5184988975524902,
"rewards/rejected": -7.821893692016602,
"step": 215
},
{
"epoch": 0.9233235372695698,
"grad_norm": 15.601926902374625,
"learning_rate": 1.623605433021985e-08,
"logits/chosen": -0.7613773345947266,
"logits/rejected": -0.8268823623657227,
"logps/chosen": -661.9906005859375,
"logps/rejected": -919.4451904296875,
"loss": 0.3372,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.652050018310547,
"rewards/margins": 2.513514995574951,
"rewards/rejected": -7.1655659675598145,
"step": 216
},
{
"epoch": 0.9275981832754475,
"grad_norm": 16.26278938580055,
"learning_rate": 1.4391058774239629e-08,
"logits/chosen": -0.6215861439704895,
"logits/rejected": -0.684840977191925,
"logps/chosen": -776.4395751953125,
"logps/rejected": -996.853271484375,
"loss": 0.326,
"rewards/accuracies": 0.84375,
"rewards/chosen": -5.608438491821289,
"rewards/margins": 1.9786306619644165,
"rewards/rejected": -7.587069511413574,
"step": 217
},
{
"epoch": 0.9318728292813251,
"grad_norm": 14.96580166748689,
"learning_rate": 1.2655783185784252e-08,
"logits/chosen": -0.5001079440116882,
"logits/rejected": -0.5907378792762756,
"logps/chosen": -636.527099609375,
"logps/rejected": -864.0882568359375,
"loss": 0.3191,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.651758670806885,
"rewards/margins": 2.046351432800293,
"rewards/rejected": -6.698110103607178,
"step": 218
},
{
"epoch": 0.9361474752872028,
"grad_norm": 16.265287798595743,
"learning_rate": 1.1030619638320804e-08,
"logits/chosen": -0.7028571963310242,
"logits/rejected": -0.7375434041023254,
"logps/chosen": -659.0259399414062,
"logps/rejected": -862.6585693359375,
"loss": 0.4146,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.674835205078125,
"rewards/margins": 1.9501476287841797,
"rewards/rejected": -6.624982833862305,
"step": 219
},
{
"epoch": 0.9404221212930804,
"grad_norm": 18.704093009678243,
"learning_rate": 9.515935326265378e-09,
"logits/chosen": -0.6694950461387634,
"logits/rejected": -0.7062525749206543,
"logps/chosen": -723.8511962890625,
"logps/rejected": -943.264892578125,
"loss": 0.3529,
"rewards/accuracies": 0.78125,
"rewards/chosen": -5.229786396026611,
"rewards/margins": 2.252092123031616,
"rewards/rejected": -7.481878757476807,
"step": 220
},
{
"epoch": 0.944696767298958,
"grad_norm": 14.334384768178085,
"learning_rate": 8.11207248201834e-09,
"logits/chosen": -0.5764753818511963,
"logits/rejected": -0.5958765745162964,
"logps/chosen": -652.0119018554688,
"logps/rejected": -832.4238891601562,
"loss": 0.3477,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.68829870223999,
"rewards/margins": 1.8251862525939941,
"rewards/rejected": -6.513484001159668,
"step": 221
},
{
"epoch": 0.9489714133048357,
"grad_norm": 13.331601354010285,
"learning_rate": 6.819348298638839e-09,
"logits/chosen": -0.5833301544189453,
"logits/rejected": -0.5820556879043579,
"logps/chosen": -636.1702270507812,
"logps/rejected": -775.4622192382812,
"loss": 0.3313,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.623978137969971,
"rewards/margins": 1.5225858688354492,
"rewards/rejected": -6.14656400680542,
"step": 222
},
{
"epoch": 0.9532460593107134,
"grad_norm": 14.100864285899743,
"learning_rate": 5.638054858177643e-09,
"logits/chosen": -0.6460739374160767,
"logits/rejected": -0.704484760761261,
"logps/chosen": -666.4550170898438,
"logps/rejected": -879.8217163085938,
"loss": 0.3125,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.722194194793701,
"rewards/margins": 2.0787832736968994,
"rewards/rejected": -6.8009772300720215,
"step": 223
},
{
"epoch": 0.957520705316591,
"grad_norm": 14.340696291340606,
"learning_rate": 4.568459065683205e-09,
"logits/chosen": -0.6707419753074646,
"logits/rejected": -0.6990107893943787,
"logps/chosen": -594.0106201171875,
"logps/rejected": -786.5427856445312,
"loss": 0.2979,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.234139442443848,
"rewards/margins": 1.8917232751846313,
"rewards/rejected": -6.1258625984191895,
"step": 224
},
{
"epoch": 0.9617953513224686,
"grad_norm": 16.87207011175805,
"learning_rate": 3.6108025888958447e-09,
"logits/chosen": -0.6689302921295166,
"logits/rejected": -0.7221664786338806,
"logps/chosen": -647.2908935546875,
"logps/rejected": -820.8074340820312,
"loss": 0.3808,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.4107489585876465,
"rewards/margins": 1.711435317993164,
"rewards/rejected": -6.1221842765808105,
"step": 225
},
{
"epoch": 0.9660699973283462,
"grad_norm": 15.439317380628985,
"learning_rate": 2.7653018036454256e-09,
"logits/chosen": -0.7938471436500549,
"logits/rejected": -0.8257592916488647,
"logps/chosen": -685.37255859375,
"logps/rejected": -856.272705078125,
"loss": 0.376,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.837329864501953,
"rewards/margins": 1.7641983032226562,
"rewards/rejected": -6.601528167724609,
"step": 226
},
{
"epoch": 0.9703446433342239,
"grad_norm": 15.341084234652751,
"learning_rate": 2.0321477449619096e-09,
"logits/chosen": -0.6373786330223083,
"logits/rejected": -0.6358063220977783,
"logps/chosen": -671.7559814453125,
"logps/rejected": -808.6800537109375,
"loss": 0.3511,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.954955101013184,
"rewards/margins": 1.4025709629058838,
"rewards/rejected": -6.357525825500488,
"step": 227
},
{
"epoch": 0.9746192893401016,
"grad_norm": 16.08851675396365,
"learning_rate": 1.4115060639128818e-09,
"logits/chosen": -0.7105420827865601,
"logits/rejected": -0.7698283195495605,
"logps/chosen": -771.7847900390625,
"logps/rejected": -1010.12255859375,
"loss": 0.3947,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.364959716796875,
"rewards/margins": 2.1707944869995117,
"rewards/rejected": -7.535754203796387,
"step": 228
},
{
"epoch": 0.9788939353459791,
"grad_norm": 16.317073121348123,
"learning_rate": 9.035169901754902e-10,
"logits/chosen": -0.6789236664772034,
"logits/rejected": -0.718908429145813,
"logps/chosen": -675.6837158203125,
"logps/rejected": -947.82958984375,
"loss": 0.3487,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.9360833168029785,
"rewards/margins": 2.77742600440979,
"rewards/rejected": -7.713509559631348,
"step": 229
},
{
"epoch": 0.9831685813518568,
"grad_norm": 14.913455884524495,
"learning_rate": 5.082953003528456e-10,
"logits/chosen": -0.7133156061172485,
"logits/rejected": -0.7598533630371094,
"logps/chosen": -662.4967651367188,
"logps/rejected": -898.2874145507812,
"loss": 0.3301,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.673389911651611,
"rewards/margins": 2.1788995265960693,
"rewards/rejected": -6.852289199829102,
"step": 230
},
{
"epoch": 0.9874432273577345,
"grad_norm": 13.34444116932449,
"learning_rate": 2.2593029204076574e-10,
"logits/chosen": -0.7370655536651611,
"logits/rejected": -0.7865728139877319,
"logps/chosen": -603.37109375,
"logps/rejected": -828.8916015625,
"loss": 0.3555,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.3032660484313965,
"rewards/margins": 2.1432766914367676,
"rewards/rejected": -6.446542739868164,
"step": 231
},
{
"epoch": 0.9917178733636121,
"grad_norm": 14.986063948498115,
"learning_rate": 5.648576365169244e-11,
"logits/chosen": -0.7017238140106201,
"logits/rejected": -0.7418109178543091,
"logps/chosen": -690.6285400390625,
"logps/rejected": -863.64208984375,
"loss": 0.352,
"rewards/accuracies": 0.90625,
"rewards/chosen": -5.160223960876465,
"rewards/margins": 1.6970188617706299,
"rewards/rejected": -6.857243061065674,
"step": 232
},
{
"epoch": 0.9959925193694897,
"grad_norm": 14.725841841963547,
"learning_rate": 0.0,
"logits/chosen": -0.540886402130127,
"logits/rejected": -0.5910319089889526,
"logps/chosen": -539.7564697265625,
"logps/rejected": -689.2047119140625,
"loss": 0.3329,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.104672431945801,
"rewards/margins": 1.427870750427246,
"rewards/rejected": -5.532542705535889,
"step": 233
},
{
"epoch": 0.9959925193694897,
"step": 233,
"total_flos": 0.0,
"train_loss": 0.45488236134655996,
"train_runtime": 10577.9564,
"train_samples_per_second": 5.66,
"train_steps_per_second": 0.022
}
],
"logging_steps": 1,
"max_steps": 233,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}