|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.098106712564544, |
|
"eval_steps": 500, |
|
"global_step": 1800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01721170395869191, |
|
"grad_norm": 0.29955029487609863, |
|
"learning_rate": 2.0293089116901574e-06, |
|
"loss": 0.6322, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03442340791738382, |
|
"grad_norm": 0.06169761344790459, |
|
"learning_rate": 2.6401917645771237e-06, |
|
"loss": 0.4697, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05163511187607573, |
|
"grad_norm": 0.051926977932453156, |
|
"learning_rate": 2.9975353258495578e-06, |
|
"loss": 0.5617, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06884681583476764, |
|
"grad_norm": 0.07096195966005325, |
|
"learning_rate": 3.25107461746409e-06, |
|
"loss": 0.4301, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08605851979345955, |
|
"grad_norm": 0.06899057328701019, |
|
"learning_rate": 3.4477349704933476e-06, |
|
"loss": 0.4905, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10327022375215146, |
|
"grad_norm": 0.08537387102842331, |
|
"learning_rate": 3.6084181787365237e-06, |
|
"loss": 0.4551, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12048192771084337, |
|
"grad_norm": 0.049780745059251785, |
|
"learning_rate": 3.7442738955429737e-06, |
|
"loss": 0.4058, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13769363166953527, |
|
"grad_norm": 0.04421038553118706, |
|
"learning_rate": 3.861957470351056e-06, |
|
"loss": 0.6748, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1549053356282272, |
|
"grad_norm": 1.9084473848342896, |
|
"learning_rate": 3.965761740008958e-06, |
|
"loss": 0.8719, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1721170395869191, |
|
"grad_norm": 0.08046019077301025, |
|
"learning_rate": 4.058617823380315e-06, |
|
"loss": 0.4635, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18932874354561102, |
|
"grad_norm": 0.21439455449581146, |
|
"learning_rate": 4.142616368250685e-06, |
|
"loss": 0.928, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.20654044750430292, |
|
"grad_norm": 0.06055545434355736, |
|
"learning_rate": 4.21930103162349e-06, |
|
"loss": 0.3721, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.22375215146299485, |
|
"grad_norm": 0.08670035004615784, |
|
"learning_rate": 4.289844083644429e-06, |
|
"loss": 0.7536, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.24096385542168675, |
|
"grad_norm": 0.06118405610322952, |
|
"learning_rate": 4.355156748429939e-06, |
|
"loss": 0.9829, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.25817555938037867, |
|
"grad_norm": 0.04853704199194908, |
|
"learning_rate": 4.415961384652748e-06, |
|
"loss": 0.4444, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.27538726333907054, |
|
"grad_norm": 0.03537767753005028, |
|
"learning_rate": 4.472840323238023e-06, |
|
"loss": 0.5064, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.29259896729776247, |
|
"grad_norm": 0.06154410541057587, |
|
"learning_rate": 4.52626987322263e-06, |
|
"loss": 0.5456, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3098106712564544, |
|
"grad_norm": 0.052560485899448395, |
|
"learning_rate": 4.576644592895925e-06, |
|
"loss": 0.5106, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3270223752151463, |
|
"grad_norm": 0.04913010448217392, |
|
"learning_rate": 4.6242949899596115e-06, |
|
"loss": 0.4026, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3442340791738382, |
|
"grad_norm": 0.07974158972501755, |
|
"learning_rate": 4.66950067626728e-06, |
|
"loss": 0.4828, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3614457831325301, |
|
"grad_norm": 0.03538183122873306, |
|
"learning_rate": 4.712500309702374e-06, |
|
"loss": 0.3549, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.37865748709122204, |
|
"grad_norm": 0.21638496220111847, |
|
"learning_rate": 4.753499221137652e-06, |
|
"loss": 0.4912, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3958691910499139, |
|
"grad_norm": 0.03895362466573715, |
|
"learning_rate": 4.792675344617211e-06, |
|
"loss": 0.3846, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.41308089500860584, |
|
"grad_norm": 0.03565879911184311, |
|
"learning_rate": 4.830183884510456e-06, |
|
"loss": 0.8434, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.43029259896729777, |
|
"grad_norm": 0.03526683151721954, |
|
"learning_rate": 4.866161029296539e-06, |
|
"loss": 0.3603, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4475043029259897, |
|
"grad_norm": 0.064102903008461, |
|
"learning_rate": 4.900726936531396e-06, |
|
"loss": 0.5178, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.46471600688468157, |
|
"grad_norm": 0.06982860714197159, |
|
"learning_rate": 4.9339881541683585e-06, |
|
"loss": 0.3712, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4819277108433735, |
|
"grad_norm": 0.0654272809624672, |
|
"learning_rate": 4.966039601316906e-06, |
|
"loss": 0.9119, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4991394148020654, |
|
"grad_norm": 0.04955059662461281, |
|
"learning_rate": 4.9969662012643525e-06, |
|
"loss": 0.3874, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5163511187607573, |
|
"grad_norm": 1.0234352350234985, |
|
"learning_rate": 4.984697781178272e-06, |
|
"loss": 0.8952, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5335628227194492, |
|
"grad_norm": 0.03769606724381447, |
|
"learning_rate": 4.96557000765111e-06, |
|
"loss": 0.3347, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5507745266781411, |
|
"grad_norm": 0.11739111691713333, |
|
"learning_rate": 4.946442234123948e-06, |
|
"loss": 0.3677, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5679862306368331, |
|
"grad_norm": 0.04959660395979881, |
|
"learning_rate": 4.927314460596787e-06, |
|
"loss": 1.1762, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5851979345955249, |
|
"grad_norm": 0.1042531356215477, |
|
"learning_rate": 4.908186687069626e-06, |
|
"loss": 0.4252, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6024096385542169, |
|
"grad_norm": 0.05064910277724266, |
|
"learning_rate": 4.889058913542464e-06, |
|
"loss": 0.3836, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6196213425129088, |
|
"grad_norm": 0.0689607635140419, |
|
"learning_rate": 4.869931140015303e-06, |
|
"loss": 0.7539, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6368330464716007, |
|
"grad_norm": 0.23462702333927155, |
|
"learning_rate": 4.850803366488141e-06, |
|
"loss": 0.8236, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6540447504302926, |
|
"grad_norm": 0.11018137633800507, |
|
"learning_rate": 4.83167559296098e-06, |
|
"loss": 0.4839, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6712564543889845, |
|
"grad_norm": 0.0751522108912468, |
|
"learning_rate": 4.812547819433818e-06, |
|
"loss": 0.5791, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6884681583476764, |
|
"grad_norm": 0.17227555811405182, |
|
"learning_rate": 4.793420045906657e-06, |
|
"loss": 0.7993, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7056798623063684, |
|
"grad_norm": 0.0664035975933075, |
|
"learning_rate": 4.7742922723794954e-06, |
|
"loss": 0.387, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7228915662650602, |
|
"grad_norm": 0.04762504622340202, |
|
"learning_rate": 4.755164498852334e-06, |
|
"loss": 0.5436, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7401032702237521, |
|
"grad_norm": 0.03658389300107956, |
|
"learning_rate": 4.736036725325173e-06, |
|
"loss": 0.6715, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7573149741824441, |
|
"grad_norm": 0.03955502808094025, |
|
"learning_rate": 4.716908951798011e-06, |
|
"loss": 0.4902, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.774526678141136, |
|
"grad_norm": 0.05926811322569847, |
|
"learning_rate": 4.69778117827085e-06, |
|
"loss": 0.7329, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7917383820998278, |
|
"grad_norm": 0.26404136419296265, |
|
"learning_rate": 4.678653404743688e-06, |
|
"loss": 0.5748, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8089500860585198, |
|
"grad_norm": 0.07195431739091873, |
|
"learning_rate": 4.6595256312165265e-06, |
|
"loss": 0.5501, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8261617900172117, |
|
"grad_norm": 0.0486939400434494, |
|
"learning_rate": 4.640397857689365e-06, |
|
"loss": 0.4527, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8433734939759037, |
|
"grad_norm": 0.05488497018814087, |
|
"learning_rate": 4.621270084162204e-06, |
|
"loss": 0.8637, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8605851979345955, |
|
"grad_norm": 0.045418575406074524, |
|
"learning_rate": 4.6021423106350425e-06, |
|
"loss": 0.437, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8777969018932874, |
|
"grad_norm": 0.04055708646774292, |
|
"learning_rate": 4.583014537107881e-06, |
|
"loss": 0.6466, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8950086058519794, |
|
"grad_norm": 0.03856475651264191, |
|
"learning_rate": 4.563886763580719e-06, |
|
"loss": 0.669, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9122203098106713, |
|
"grad_norm": 0.035741958767175674, |
|
"learning_rate": 4.5447589900535585e-06, |
|
"loss": 0.3615, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9294320137693631, |
|
"grad_norm": 0.04278489947319031, |
|
"learning_rate": 4.525631216526396e-06, |
|
"loss": 0.3849, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9466437177280551, |
|
"grad_norm": 0.031775712966918945, |
|
"learning_rate": 4.506503442999236e-06, |
|
"loss": 0.6446, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 0.19989252090454102, |
|
"learning_rate": 4.487375669472074e-06, |
|
"loss": 0.6668, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.9810671256454389, |
|
"grad_norm": 0.04056662693619728, |
|
"learning_rate": 4.468247895944912e-06, |
|
"loss": 0.4243, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.9982788296041308, |
|
"grad_norm": 0.06392610818147659, |
|
"learning_rate": 4.449120122417751e-06, |
|
"loss": 0.3431, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.0154905335628228, |
|
"grad_norm": 0.03935154527425766, |
|
"learning_rate": 4.42999234889059e-06, |
|
"loss": 0.5167, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.0327022375215147, |
|
"grad_norm": 0.05566889047622681, |
|
"learning_rate": 4.410864575363428e-06, |
|
"loss": 0.4372, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.0499139414802066, |
|
"grad_norm": 0.07127536088228226, |
|
"learning_rate": 4.391736801836267e-06, |
|
"loss": 1.4152, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.0671256454388984, |
|
"grad_norm": 0.04618392139673233, |
|
"learning_rate": 4.372609028309105e-06, |
|
"loss": 0.601, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.0843373493975903, |
|
"grad_norm": 0.04588570445775986, |
|
"learning_rate": 4.3534812547819434e-06, |
|
"loss": 0.4723, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.1015490533562822, |
|
"grad_norm": 0.03991321101784706, |
|
"learning_rate": 4.334353481254782e-06, |
|
"loss": 0.4807, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.1187607573149743, |
|
"grad_norm": 0.2501582205295563, |
|
"learning_rate": 4.315225707727621e-06, |
|
"loss": 0.8098, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.1359724612736661, |
|
"grad_norm": 0.042163778096437454, |
|
"learning_rate": 4.296097934200459e-06, |
|
"loss": 0.4158, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.153184165232358, |
|
"grad_norm": 0.04054609313607216, |
|
"learning_rate": 4.276970160673298e-06, |
|
"loss": 0.3728, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.1703958691910499, |
|
"grad_norm": 0.0925000011920929, |
|
"learning_rate": 4.257842387146137e-06, |
|
"loss": 0.4251, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.1876075731497417, |
|
"grad_norm": 0.06017041206359863, |
|
"learning_rate": 4.2387146136189745e-06, |
|
"loss": 0.4782, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.2048192771084336, |
|
"grad_norm": 0.040517594665288925, |
|
"learning_rate": 4.219586840091814e-06, |
|
"loss": 0.4354, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.2220309810671257, |
|
"grad_norm": 0.04731125384569168, |
|
"learning_rate": 4.200459066564652e-06, |
|
"loss": 0.4969, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.2392426850258176, |
|
"grad_norm": 0.050880610942840576, |
|
"learning_rate": 4.1813312930374905e-06, |
|
"loss": 0.492, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.2564543889845095, |
|
"grad_norm": 0.04548948258161545, |
|
"learning_rate": 4.162203519510329e-06, |
|
"loss": 0.3914, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.2736660929432013, |
|
"grad_norm": 0.03825736418366432, |
|
"learning_rate": 4.143075745983168e-06, |
|
"loss": 0.3921, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.2908777969018934, |
|
"grad_norm": 0.046227287501096725, |
|
"learning_rate": 4.1239479724560065e-06, |
|
"loss": 0.4632, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.3080895008605853, |
|
"grad_norm": 0.04002716392278671, |
|
"learning_rate": 4.104820198928845e-06, |
|
"loss": 0.7436, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.3253012048192772, |
|
"grad_norm": 0.04381329566240311, |
|
"learning_rate": 4.085692425401683e-06, |
|
"loss": 0.5388, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.342512908777969, |
|
"grad_norm": 0.09227538853883743, |
|
"learning_rate": 4.0665646518745225e-06, |
|
"loss": 0.7008, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.359724612736661, |
|
"grad_norm": 0.0453125424683094, |
|
"learning_rate": 4.04743687834736e-06, |
|
"loss": 0.4813, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.3769363166953528, |
|
"grad_norm": 0.20484060049057007, |
|
"learning_rate": 4.0283091048202e-06, |
|
"loss": 0.6594, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.3941480206540446, |
|
"grad_norm": 0.05485668033361435, |
|
"learning_rate": 4.009181331293038e-06, |
|
"loss": 0.6538, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.4113597246127367, |
|
"grad_norm": 0.04452645406126976, |
|
"learning_rate": 3.990053557765876e-06, |
|
"loss": 0.3713, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.03632510080933571, |
|
"learning_rate": 3.970925784238715e-06, |
|
"loss": 0.3395, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.4457831325301205, |
|
"grad_norm": 0.0884113535284996, |
|
"learning_rate": 3.951798010711554e-06, |
|
"loss": 0.3602, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.4629948364888123, |
|
"grad_norm": 0.1275469958782196, |
|
"learning_rate": 3.932670237184392e-06, |
|
"loss": 0.4533, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.4802065404475044, |
|
"grad_norm": 0.03843805938959122, |
|
"learning_rate": 3.913542463657231e-06, |
|
"loss": 0.7519, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.4974182444061963, |
|
"grad_norm": 0.03635178506374359, |
|
"learning_rate": 3.89441469013007e-06, |
|
"loss": 0.388, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.5146299483648882, |
|
"grad_norm": 0.039031002670526505, |
|
"learning_rate": 3.875286916602907e-06, |
|
"loss": 0.4425, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.53184165232358, |
|
"grad_norm": 0.04110798239707947, |
|
"learning_rate": 3.856159143075746e-06, |
|
"loss": 0.4095, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.549053356282272, |
|
"grad_norm": 0.04002736508846283, |
|
"learning_rate": 3.837031369548585e-06, |
|
"loss": 0.6104, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.5662650602409638, |
|
"grad_norm": 0.03314425051212311, |
|
"learning_rate": 3.817903596021423e-06, |
|
"loss": 0.5594, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.5834767641996557, |
|
"grad_norm": 0.03947990760207176, |
|
"learning_rate": 3.798775822494262e-06, |
|
"loss": 0.4931, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.6006884681583475, |
|
"grad_norm": 0.05939627066254616, |
|
"learning_rate": 3.7796480489671007e-06, |
|
"loss": 0.5127, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.6179001721170396, |
|
"grad_norm": 0.03439631685614586, |
|
"learning_rate": 3.760520275439939e-06, |
|
"loss": 0.4139, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.6351118760757315, |
|
"grad_norm": 0.06566853076219559, |
|
"learning_rate": 3.7413925019127776e-06, |
|
"loss": 0.6641, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.6523235800344234, |
|
"grad_norm": 0.06731946766376495, |
|
"learning_rate": 3.7222647283856163e-06, |
|
"loss": 0.6865, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.6695352839931155, |
|
"grad_norm": 0.03529343381524086, |
|
"learning_rate": 3.703136954858455e-06, |
|
"loss": 0.6395, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.6867469879518073, |
|
"grad_norm": 0.09028229117393494, |
|
"learning_rate": 3.684009181331293e-06, |
|
"loss": 0.774, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.7039586919104992, |
|
"grad_norm": 0.04828124865889549, |
|
"learning_rate": 3.664881407804132e-06, |
|
"loss": 0.4953, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.721170395869191, |
|
"grad_norm": 0.050330750644207, |
|
"learning_rate": 3.6457536342769705e-06, |
|
"loss": 0.6435, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.738382099827883, |
|
"grad_norm": 0.03781217709183693, |
|
"learning_rate": 3.6266258607498087e-06, |
|
"loss": 0.4538, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.7555938037865748, |
|
"grad_norm": 0.053586967289447784, |
|
"learning_rate": 3.607498087222648e-06, |
|
"loss": 0.384, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.7728055077452667, |
|
"grad_norm": 0.04280597344040871, |
|
"learning_rate": 3.588370313695486e-06, |
|
"loss": 0.385, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.7900172117039586, |
|
"grad_norm": 0.05530484393239021, |
|
"learning_rate": 3.5692425401683243e-06, |
|
"loss": 0.732, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.8072289156626506, |
|
"grad_norm": 0.05707624554634094, |
|
"learning_rate": 3.5501147666411634e-06, |
|
"loss": 0.4075, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.8244406196213425, |
|
"grad_norm": 0.07795403897762299, |
|
"learning_rate": 3.5309869931140016e-06, |
|
"loss": 1.0486, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.8416523235800344, |
|
"grad_norm": 0.08253274112939835, |
|
"learning_rate": 3.5118592195868407e-06, |
|
"loss": 0.7014, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.8588640275387265, |
|
"grad_norm": 0.037665221840143204, |
|
"learning_rate": 3.492731446059679e-06, |
|
"loss": 0.5129, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.8760757314974184, |
|
"grad_norm": 0.08074070513248444, |
|
"learning_rate": 3.473603672532517e-06, |
|
"loss": 0.6965, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.8932874354561102, |
|
"grad_norm": 0.053863946348428726, |
|
"learning_rate": 3.4544758990053563e-06, |
|
"loss": 0.3608, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.910499139414802, |
|
"grad_norm": 0.03980562463402748, |
|
"learning_rate": 3.4353481254781945e-06, |
|
"loss": 0.3408, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.927710843373494, |
|
"grad_norm": 0.03091476857662201, |
|
"learning_rate": 3.4162203519510336e-06, |
|
"loss": 0.4147, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.9449225473321858, |
|
"grad_norm": 0.05423520505428314, |
|
"learning_rate": 3.399005355776588e-06, |
|
"loss": 0.501, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.9621342512908777, |
|
"grad_norm": 0.056222882121801376, |
|
"learning_rate": 3.379877582249426e-06, |
|
"loss": 0.6646, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.9793459552495696, |
|
"grad_norm": 0.04780727997422218, |
|
"learning_rate": 3.360749808722265e-06, |
|
"loss": 0.4433, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.9965576592082617, |
|
"grad_norm": 0.0465485118329525, |
|
"learning_rate": 3.3416220351951034e-06, |
|
"loss": 0.4117, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.0137693631669533, |
|
"grad_norm": 0.038410015404224396, |
|
"learning_rate": 3.3224942616679424e-06, |
|
"loss": 0.9719, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.0309810671256456, |
|
"grad_norm": 0.03839205205440521, |
|
"learning_rate": 3.3033664881407807e-06, |
|
"loss": 0.5383, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.0481927710843375, |
|
"grad_norm": 0.05250284820795059, |
|
"learning_rate": 3.284238714613619e-06, |
|
"loss": 0.5573, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.0654044750430294, |
|
"grad_norm": 0.05850391089916229, |
|
"learning_rate": 3.265110941086458e-06, |
|
"loss": 0.3652, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.0826161790017212, |
|
"grad_norm": 0.03551226481795311, |
|
"learning_rate": 3.2459831675592962e-06, |
|
"loss": 1.1687, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.099827882960413, |
|
"grad_norm": 0.035683631896972656, |
|
"learning_rate": 3.226855394032135e-06, |
|
"loss": 0.3377, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.117039586919105, |
|
"grad_norm": 0.05406322330236435, |
|
"learning_rate": 3.2077276205049736e-06, |
|
"loss": 0.4614, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.134251290877797, |
|
"grad_norm": 0.030787965282797813, |
|
"learning_rate": 3.188599846977812e-06, |
|
"loss": 0.3771, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.1514629948364887, |
|
"grad_norm": 0.04496818408370018, |
|
"learning_rate": 3.169472073450651e-06, |
|
"loss": 0.4846, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.1686746987951806, |
|
"grad_norm": 0.03633632883429527, |
|
"learning_rate": 3.150344299923489e-06, |
|
"loss": 0.3549, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.1858864027538725, |
|
"grad_norm": 0.033117033541202545, |
|
"learning_rate": 3.1312165263963278e-06, |
|
"loss": 0.4224, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.2030981067125643, |
|
"grad_norm": 0.04940853640437126, |
|
"learning_rate": 3.1120887528691664e-06, |
|
"loss": 0.6976, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.2203098106712567, |
|
"grad_norm": 0.03474991396069527, |
|
"learning_rate": 3.092960979342005e-06, |
|
"loss": 0.5837, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.2375215146299485, |
|
"grad_norm": 0.08616980165243149, |
|
"learning_rate": 3.0738332058148433e-06, |
|
"loss": 0.5885, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.2547332185886404, |
|
"grad_norm": 0.04921899363398552, |
|
"learning_rate": 3.054705432287682e-06, |
|
"loss": 0.4007, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.2719449225473323, |
|
"grad_norm": 0.033128101378679276, |
|
"learning_rate": 3.0355776587605207e-06, |
|
"loss": 0.3948, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.289156626506024, |
|
"grad_norm": 0.0420563630759716, |
|
"learning_rate": 3.016449885233359e-06, |
|
"loss": 0.6675, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.306368330464716, |
|
"grad_norm": 0.04620426893234253, |
|
"learning_rate": 2.997322111706198e-06, |
|
"loss": 0.3454, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.323580034423408, |
|
"grad_norm": 0.031115278601646423, |
|
"learning_rate": 2.9781943381790362e-06, |
|
"loss": 0.4697, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.3407917383820998, |
|
"grad_norm": 0.03716883435845375, |
|
"learning_rate": 2.9590665646518745e-06, |
|
"loss": 0.7016, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.3580034423407916, |
|
"grad_norm": 0.2217116802930832, |
|
"learning_rate": 2.9399387911247135e-06, |
|
"loss": 0.6504, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.3752151462994835, |
|
"grad_norm": 0.08799983561038971, |
|
"learning_rate": 2.9208110175975518e-06, |
|
"loss": 0.3518, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.3924268502581754, |
|
"grad_norm": 0.03414052352309227, |
|
"learning_rate": 2.901683244070391e-06, |
|
"loss": 0.5522, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.4096385542168672, |
|
"grad_norm": 0.14305748045444489, |
|
"learning_rate": 2.882555470543229e-06, |
|
"loss": 0.7692, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.4268502581755595, |
|
"grad_norm": 0.04776856303215027, |
|
"learning_rate": 2.8634276970160673e-06, |
|
"loss": 0.4163, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.4440619621342514, |
|
"grad_norm": 0.06117096543312073, |
|
"learning_rate": 2.8442999234889064e-06, |
|
"loss": 0.3797, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.4612736660929433, |
|
"grad_norm": 0.1437849998474121, |
|
"learning_rate": 2.8251721499617447e-06, |
|
"loss": 0.3978, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.478485370051635, |
|
"grad_norm": 0.03535407409071922, |
|
"learning_rate": 2.8060443764345833e-06, |
|
"loss": 0.7543, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.495697074010327, |
|
"grad_norm": 0.034573543816804886, |
|
"learning_rate": 2.786916602907422e-06, |
|
"loss": 0.4385, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.512908777969019, |
|
"grad_norm": 0.05264075845479965, |
|
"learning_rate": 2.7677888293802602e-06, |
|
"loss": 0.5788, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.5301204819277108, |
|
"grad_norm": 0.047263339161872864, |
|
"learning_rate": 2.748661055853099e-06, |
|
"loss": 0.5397, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.5473321858864026, |
|
"grad_norm": 0.03852943331003189, |
|
"learning_rate": 2.7295332823259375e-06, |
|
"loss": 0.3995, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.5645438898450945, |
|
"grad_norm": 0.04756772890686989, |
|
"learning_rate": 2.710405508798776e-06, |
|
"loss": 0.5136, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.581755593803787, |
|
"grad_norm": 0.07750029861927032, |
|
"learning_rate": 2.6912777352716144e-06, |
|
"loss": 0.8293, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.5989672977624787, |
|
"grad_norm": 0.047012392431497574, |
|
"learning_rate": 2.672149961744453e-06, |
|
"loss": 0.5485, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.6161790017211706, |
|
"grad_norm": 0.04318179562687874, |
|
"learning_rate": 2.6530221882172918e-06, |
|
"loss": 0.4112, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.6333907056798624, |
|
"grad_norm": 0.06012555584311485, |
|
"learning_rate": 2.63389441469013e-06, |
|
"loss": 0.7031, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.6506024096385543, |
|
"grad_norm": 0.03384987264871597, |
|
"learning_rate": 2.614766641162969e-06, |
|
"loss": 0.439, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.667814113597246, |
|
"grad_norm": 0.05770883336663246, |
|
"learning_rate": 2.5956388676358073e-06, |
|
"loss": 0.3991, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.685025817555938, |
|
"grad_norm": 0.05510050430893898, |
|
"learning_rate": 2.5765110941086456e-06, |
|
"loss": 0.9784, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.70223752151463, |
|
"grad_norm": 0.055017050355672836, |
|
"learning_rate": 2.5573833205814846e-06, |
|
"loss": 0.3796, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.719449225473322, |
|
"grad_norm": 0.04332127049565315, |
|
"learning_rate": 2.538255547054323e-06, |
|
"loss": 0.433, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.7366609294320137, |
|
"grad_norm": 0.060054711997509, |
|
"learning_rate": 2.519127773527162e-06, |
|
"loss": 0.2799, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.7538726333907055, |
|
"grad_norm": 0.0340825691819191, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.6797, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.7710843373493974, |
|
"grad_norm": 0.22405555844306946, |
|
"learning_rate": 2.480872226472839e-06, |
|
"loss": 0.6071, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.7882960413080893, |
|
"grad_norm": 0.04493927210569382, |
|
"learning_rate": 2.4617444529456775e-06, |
|
"loss": 0.4004, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.805507745266781, |
|
"grad_norm": 0.06454917788505554, |
|
"learning_rate": 2.4426166794185158e-06, |
|
"loss": 0.3903, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.8227194492254735, |
|
"grad_norm": 0.07336492091417313, |
|
"learning_rate": 2.4234889058913544e-06, |
|
"loss": 0.9157, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.8399311531841653, |
|
"grad_norm": 0.08775831758975983, |
|
"learning_rate": 2.404361132364193e-06, |
|
"loss": 0.4865, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.03372660651803017, |
|
"learning_rate": 2.3852333588370317e-06, |
|
"loss": 0.3975, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.874354561101549, |
|
"grad_norm": 0.034449730068445206, |
|
"learning_rate": 2.3661055853098704e-06, |
|
"loss": 0.3927, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.891566265060241, |
|
"grad_norm": 0.02975647896528244, |
|
"learning_rate": 2.3469778117827086e-06, |
|
"loss": 0.3664, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.908777969018933, |
|
"grad_norm": 0.037901297211647034, |
|
"learning_rate": 2.3278500382555473e-06, |
|
"loss": 0.3973, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.9259896729776247, |
|
"grad_norm": 0.05662724748253822, |
|
"learning_rate": 2.308722264728386e-06, |
|
"loss": 0.4422, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.9432013769363166, |
|
"grad_norm": 0.044157788157463074, |
|
"learning_rate": 2.289594491201224e-06, |
|
"loss": 0.4324, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.960413080895009, |
|
"grad_norm": 0.04280713573098183, |
|
"learning_rate": 2.270466717674063e-06, |
|
"loss": 0.5674, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.9776247848537007, |
|
"grad_norm": 0.04871043935418129, |
|
"learning_rate": 2.2513389441469015e-06, |
|
"loss": 0.3223, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.9948364888123926, |
|
"grad_norm": 0.036149609833955765, |
|
"learning_rate": 2.2322111706197398e-06, |
|
"loss": 0.6471, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.0120481927710845, |
|
"grad_norm": 0.02951321005821228, |
|
"learning_rate": 2.2130833970925784e-06, |
|
"loss": 0.3926, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.0292598967297764, |
|
"grad_norm": 0.04006199911236763, |
|
"learning_rate": 2.193955623565417e-06, |
|
"loss": 0.6222, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.0464716006884682, |
|
"grad_norm": 0.03238508850336075, |
|
"learning_rate": 2.1748278500382557e-06, |
|
"loss": 0.4144, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.06368330464716, |
|
"grad_norm": 0.035425204783678055, |
|
"learning_rate": 2.1557000765110944e-06, |
|
"loss": 0.3745, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.080895008605852, |
|
"grad_norm": 0.08181657642126083, |
|
"learning_rate": 2.1365723029839326e-06, |
|
"loss": 0.4049, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.098106712564544, |
|
"grad_norm": 0.03448079526424408, |
|
"learning_rate": 2.1174445294567713e-06, |
|
"loss": 0.5435, |
|
"step": 1800 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2905, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|