{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9964796996010326,
  "eval_steps": 100,
  "global_step": 1596,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018774935461159353,
      "grad_norm": 0.3424220085144043,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 3.1063,
      "step": 10
    },
    {
      "epoch": 0.03754987092231871,
      "grad_norm": 0.3918570578098297,
      "learning_rate": 4.000000000000001e-06,
      "loss": 3.0935,
      "step": 20
    },
    {
      "epoch": 0.05632480638347806,
      "grad_norm": 0.42540064454078674,
      "learning_rate": 6e-06,
      "loss": 3.0475,
      "step": 30
    },
    {
      "epoch": 0.07509974184463741,
      "grad_norm": 0.47563326358795166,
      "learning_rate": 8.000000000000001e-06,
      "loss": 3.0609,
      "step": 40
    },
    {
      "epoch": 0.09387467730579677,
      "grad_norm": 0.5615227818489075,
      "learning_rate": 1e-05,
      "loss": 3.026,
      "step": 50
    },
    {
      "epoch": 0.11264961276695612,
      "grad_norm": 0.5656551122665405,
      "learning_rate": 1.2e-05,
      "loss": 3.0134,
      "step": 60
    },
    {
      "epoch": 0.13142454822811547,
      "grad_norm": 0.5487608313560486,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 2.9549,
      "step": 70
    },
    {
      "epoch": 0.15019948368927483,
      "grad_norm": 0.48524507880210876,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 2.9301,
      "step": 80
    },
    {
      "epoch": 0.16897441915043418,
      "grad_norm": 0.5230005383491516,
      "learning_rate": 1.8e-05,
      "loss": 2.9109,
      "step": 90
    },
    {
      "epoch": 0.18774935461159353,
      "grad_norm": 0.38439810276031494,
      "learning_rate": 2e-05,
      "loss": 2.8142,
      "step": 100
    },
    {
      "epoch": 0.18774935461159353,
      "eval_loss": 2.856565237045288,
      "eval_runtime": 115.5533,
      "eval_samples_per_second": 8.195,
      "eval_steps_per_second": 4.102,
      "step": 100
    },
    {
      "epoch": 0.2065242900727529,
      "grad_norm": 0.4284297227859497,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 2.8967,
      "step": 110
    },
    {
      "epoch": 0.22529922553391224,
      "grad_norm": 0.41896769404411316,
      "learning_rate": 2.4e-05,
      "loss": 2.8629,
      "step": 120
    },
    {
      "epoch": 0.2440741609950716,
      "grad_norm": 0.41066381335258484,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 2.8142,
      "step": 130
    },
    {
      "epoch": 0.26284909645623095,
      "grad_norm": 0.44229111075401306,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 2.815,
      "step": 140
    },
    {
      "epoch": 0.28162403191739027,
      "grad_norm": 0.43809670209884644,
      "learning_rate": 3e-05,
      "loss": 2.7881,
      "step": 150
    },
    {
      "epoch": 0.30039896737854965,
      "grad_norm": 0.48944246768951416,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 2.7937,
      "step": 160
    },
    {
      "epoch": 0.319173902839709,
      "grad_norm": 0.5136508941650391,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 2.7524,
      "step": 170
    },
    {
      "epoch": 0.33794883830086836,
      "grad_norm": 0.48745179176330566,
      "learning_rate": 3.6e-05,
      "loss": 2.7675,
      "step": 180
    },
    {
      "epoch": 0.3567237737620277,
      "grad_norm": 0.4948646128177643,
      "learning_rate": 3.8e-05,
      "loss": 2.731,
      "step": 190
    },
    {
      "epoch": 0.37549870922318707,
      "grad_norm": 0.5109882354736328,
      "learning_rate": 4e-05,
      "loss": 2.7235,
      "step": 200
    },
    {
      "epoch": 0.37549870922318707,
      "eval_loss": 2.6776111125946045,
      "eval_runtime": 115.5701,
      "eval_samples_per_second": 8.194,
      "eval_steps_per_second": 4.101,
      "step": 200
    },
    {
      "epoch": 0.3942736446843464,
      "grad_norm": 0.5552621483802795,
      "learning_rate": 4.2e-05,
      "loss": 2.7244,
      "step": 210
    },
    {
      "epoch": 0.4130485801455058,
      "grad_norm": 0.5443555116653442,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 2.6612,
      "step": 220
    },
    {
      "epoch": 0.4318235156066651,
      "grad_norm": 0.6089721918106079,
      "learning_rate": 4.600000000000001e-05,
      "loss": 2.6635,
      "step": 230
    },
    {
      "epoch": 0.4505984510678245,
      "grad_norm": 0.6026090383529663,
      "learning_rate": 4.8e-05,
      "loss": 2.6335,
      "step": 240
    },
    {
      "epoch": 0.4693733865289838,
      "grad_norm": 0.6256532073020935,
      "learning_rate": 5e-05,
      "loss": 2.6262,
      "step": 250
    },
    {
      "epoch": 0.4881483219901432,
      "grad_norm": 0.6782171726226807,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 2.665,
      "step": 260
    },
    {
      "epoch": 0.5069232574513025,
      "grad_norm": 0.7474572658538818,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 2.7222,
      "step": 270
    },
    {
      "epoch": 0.5256981929124619,
      "grad_norm": 0.6566171646118164,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 2.6626,
      "step": 280
    },
    {
      "epoch": 0.5444731283736212,
      "grad_norm": 0.7244133949279785,
      "learning_rate": 5.8e-05,
      "loss": 2.5429,
      "step": 290
    },
    {
      "epoch": 0.5632480638347805,
      "grad_norm": 0.7084689140319824,
      "learning_rate": 6e-05,
      "loss": 2.5809,
      "step": 300
    },
    {
      "epoch": 0.5632480638347805,
      "eval_loss": 2.5663962364196777,
      "eval_runtime": 115.4961,
      "eval_samples_per_second": 8.199,
      "eval_steps_per_second": 4.104,
      "step": 300
    },
    {
      "epoch": 0.5820229992959399,
      "grad_norm": 0.7440884709358215,
      "learning_rate": 6.2e-05,
      "loss": 2.6225,
      "step": 310
    },
    {
      "epoch": 0.6007979347570993,
      "grad_norm": 1.0468900203704834,
      "learning_rate": 6.400000000000001e-05,
      "loss": 2.579,
      "step": 320
    },
    {
      "epoch": 0.6195728702182586,
      "grad_norm": 1.0507537126541138,
      "learning_rate": 6.6e-05,
      "loss": 2.6367,
      "step": 330
    },
    {
      "epoch": 0.638347805679418,
      "grad_norm": 1.0535833835601807,
      "learning_rate": 6.800000000000001e-05,
      "loss": 2.556,
      "step": 340
    },
    {
      "epoch": 0.6571227411405773,
      "grad_norm": 1.5066871643066406,
      "learning_rate": 7e-05,
      "loss": 2.5231,
      "step": 350
    },
    {
      "epoch": 0.6758976766017367,
      "grad_norm": 1.1159995794296265,
      "learning_rate": 7.2e-05,
      "loss": 2.4532,
      "step": 360
    },
    {
      "epoch": 0.694672612062896,
      "grad_norm": 1.4439431428909302,
      "learning_rate": 7.4e-05,
      "loss": 2.4436,
      "step": 370
    },
    {
      "epoch": 0.7134475475240554,
      "grad_norm": 1.1311273574829102,
      "learning_rate": 7.58e-05,
      "loss": 2.4088,
      "step": 380
    },
    {
      "epoch": 0.7322224829852148,
      "grad_norm": 0.9161147475242615,
      "learning_rate": 7.780000000000001e-05,
      "loss": 2.5574,
      "step": 390
    },
    {
      "epoch": 0.7509974184463741,
      "grad_norm": 1.2713381052017212,
      "learning_rate": 7.98e-05,
      "loss": 2.3971,
      "step": 400
    },
    {
      "epoch": 0.7509974184463741,
      "eval_loss": 2.4457809925079346,
      "eval_runtime": 115.5729,
      "eval_samples_per_second": 8.194,
      "eval_steps_per_second": 4.101,
      "step": 400
    },
    {
      "epoch": 0.7697723539075334,
      "grad_norm": 1.4076571464538574,
      "learning_rate": 8.18e-05,
      "loss": 2.5646,
      "step": 410
    },
    {
      "epoch": 0.7885472893686928,
      "grad_norm": 1.0278912782669067,
      "learning_rate": 8.38e-05,
      "loss": 2.5097,
      "step": 420
    },
    {
      "epoch": 0.8073222248298522,
      "grad_norm": 0.9688978791236877,
      "learning_rate": 8.58e-05,
      "loss": 2.4623,
      "step": 430
    },
    {
      "epoch": 0.8260971602910115,
      "grad_norm": 1.3491131067276,
      "learning_rate": 8.78e-05,
      "loss": 2.508,
      "step": 440
    },
    {
      "epoch": 0.8448720957521708,
      "grad_norm": 0.8669747710227966,
      "learning_rate": 8.98e-05,
      "loss": 2.518,
      "step": 450
    },
    {
      "epoch": 0.8636470312133302,
      "grad_norm": 3.474540948867798,
      "learning_rate": 9.180000000000001e-05,
      "loss": 2.4558,
      "step": 460
    },
    {
      "epoch": 0.8824219666744896,
      "grad_norm": 0.9799988865852356,
      "learning_rate": 9.38e-05,
      "loss": 2.3909,
      "step": 470
    },
    {
      "epoch": 0.901196902135649,
      "grad_norm": 1.605360507965088,
      "learning_rate": 9.58e-05,
      "loss": 2.3931,
      "step": 480
    },
    {
      "epoch": 0.9199718375968082,
      "grad_norm": 1.0437480211257935,
      "learning_rate": 9.78e-05,
      "loss": 2.3893,
      "step": 490
    },
    {
      "epoch": 0.9387467730579676,
      "grad_norm": 0.7387896180152893,
      "learning_rate": 9.98e-05,
      "loss": 2.4147,
      "step": 500
    },
    {
      "epoch": 0.9387467730579676,
      "eval_loss": 2.381179094314575,
      "eval_runtime": 115.6216,
      "eval_samples_per_second": 8.191,
      "eval_steps_per_second": 4.1,
      "step": 500
    },
    {
      "epoch": 0.957521708519127,
      "grad_norm": 1.0030367374420166,
      "learning_rate": 9.998336282524578e-05,
      "loss": 2.4192,
      "step": 510
    },
    {
      "epoch": 0.9762966439802864,
      "grad_norm": 2.564517021179199,
      "learning_rate": 9.992586581827853e-05,
      "loss": 2.4279,
      "step": 520
    },
    {
      "epoch": 0.9950715794414456,
      "grad_norm": 0.8353458642959595,
      "learning_rate": 9.983904607709364e-05,
      "loss": 2.4838,
      "step": 530
    },
    {
      "epoch": 1.013846514902605,
      "grad_norm": 0.7976542115211487,
      "learning_rate": 9.970368310819e-05,
      "loss": 2.4366,
      "step": 540
    },
    {
      "epoch": 1.0326214503637643,
      "grad_norm": 1.2872307300567627,
      "learning_rate": 9.952748467735829e-05,
      "loss": 2.3284,
      "step": 550
    },
    {
      "epoch": 1.0513963858249238,
      "grad_norm": 1.1133558750152588,
      "learning_rate": 9.931059554538611e-05,
      "loss": 2.3192,
      "step": 560
    },
    {
      "epoch": 1.070171321286083,
      "grad_norm": 1.0255626440048218,
      "learning_rate": 9.905319390365365e-05,
      "loss": 2.3409,
      "step": 570
    },
    {
      "epoch": 1.0889462567472425,
      "grad_norm": 1.0734648704528809,
      "learning_rate": 9.875549122773535e-05,
      "loss": 2.4633,
      "step": 580
    },
    {
      "epoch": 1.1077211922084018,
      "grad_norm": 0.9561296105384827,
      "learning_rate": 9.841773210365646e-05,
      "loss": 2.3144,
      "step": 590
    },
    {
      "epoch": 1.126496127669561,
      "grad_norm": 0.8402953743934631,
      "learning_rate": 9.804019402694626e-05,
      "loss": 2.3987,
      "step": 600
    },
    {
      "epoch": 1.126496127669561,
      "eval_loss": 2.343639850616455,
      "eval_runtime": 115.6739,
      "eval_samples_per_second": 8.187,
      "eval_steps_per_second": 4.098,
      "step": 600
    },
    {
      "epoch": 1.1452710631307206,
      "grad_norm": 0.9632971882820129,
      "learning_rate": 9.762318717465388e-05,
      "loss": 2.4487,
      "step": 610
    },
    {
      "epoch": 1.1640459985918798,
      "grad_norm": 0.996286153793335,
      "learning_rate": 9.716705415051361e-05,
      "loss": 2.5009,
      "step": 620
    },
    {
      "epoch": 1.1828209340530391,
      "grad_norm": 0.9334232211112976,
      "learning_rate": 9.667216970346914e-05,
      "loss": 2.2991,
      "step": 630
    },
    {
      "epoch": 1.2015958695141986,
      "grad_norm": 1.014267086982727,
      "learning_rate": 9.613894041978795e-05,
      "loss": 2.4963,
      "step": 640
    },
    {
      "epoch": 1.2203708049753579,
      "grad_norm": 1.3219252824783325,
      "learning_rate": 9.556780438901898e-05,
      "loss": 2.4339,
      "step": 650
    },
    {
      "epoch": 1.2391457404365172,
      "grad_norm": 0.7801995873451233,
      "learning_rate": 9.495923084406772e-05,
      "loss": 2.3548,
      "step": 660
    },
    {
      "epoch": 1.2579206758976766,
      "grad_norm": 1.7439610958099365,
      "learning_rate": 9.431371977568483e-05,
      "loss": 2.4373,
      "step": 670
    },
    {
      "epoch": 1.276695611358836,
      "grad_norm": 1.080696702003479,
      "learning_rate": 9.363180152168447e-05,
      "loss": 2.1908,
      "step": 680
    },
    {
      "epoch": 1.2954705468199954,
      "grad_norm": 1.151071548461914,
      "learning_rate": 9.291403633123046e-05,
      "loss": 2.5268,
      "step": 690
    },
    {
      "epoch": 1.3142454822811547,
      "grad_norm": 0.8707749843597412,
      "learning_rate": 9.21610139045477e-05,
      "loss": 2.3,
      "step": 700
    },
    {
      "epoch": 1.3142454822811547,
      "eval_loss": 2.3193163871765137,
      "eval_runtime": 115.797,
      "eval_samples_per_second": 8.178,
      "eval_steps_per_second": 4.093,
      "step": 700
    },
    {
      "epoch": 1.333020417742314,
      "grad_norm": 1.0154194831848145,
      "learning_rate": 9.13733529084374e-05,
      "loss": 2.3169,
      "step": 710
    },
    {
      "epoch": 1.3517953532034734,
      "grad_norm": 1.2447744607925415,
      "learning_rate": 9.055170046799385e-05,
      "loss": 2.2876,
      "step": 720
    },
    {
      "epoch": 1.3705702886646327,
      "grad_norm": 1.545040249824524,
      "learning_rate": 8.969673163494064e-05,
      "loss": 2.3832,
      "step": 730
    },
    {
      "epoch": 1.3893452241257922,
      "grad_norm": 0.9809775352478027,
      "learning_rate": 8.880914883302278e-05,
      "loss": 2.4271,
      "step": 740
    },
    {
      "epoch": 1.4081201595869515,
      "grad_norm": 0.8140939474105835,
      "learning_rate": 8.788968128091083e-05,
      "loss": 2.426,
      "step": 750
    },
    {
      "epoch": 1.4268950950481107,
      "grad_norm": 0.9135342240333557,
      "learning_rate": 8.69390843930906e-05,
      "loss": 2.3396,
      "step": 760
    },
    {
      "epoch": 1.44567003050927,
      "grad_norm": 0.7756408452987671,
      "learning_rate": 8.595813915923113e-05,
      "loss": 2.3238,
      "step": 770
    },
    {
      "epoch": 1.4644449659704295,
      "grad_norm": 0.8543408513069153,
      "learning_rate": 8.494765150254062e-05,
      "loss": 2.509,
      "step": 780
    },
    {
      "epoch": 1.4832199014315888,
      "grad_norm": 0.8993504643440247,
      "learning_rate": 8.390845161763756e-05,
      "loss": 2.1994,
      "step": 790
    },
    {
      "epoch": 1.5019948368927483,
      "grad_norm": 1.2262345552444458,
      "learning_rate": 8.284139328848082e-05,
      "loss": 2.3219,
      "step": 800
    },
    {
      "epoch": 1.5019948368927483,
      "eval_loss": 2.295060873031616,
      "eval_runtime": 115.6726,
      "eval_samples_per_second": 8.187,
      "eval_steps_per_second": 4.098,
      "step": 800
    },
    {
      "epoch": 1.5207697723539075,
      "grad_norm": 0.8056873679161072,
      "learning_rate": 8.174735318691945e-05,
      "loss": 2.4357,
      "step": 810
    },
    {
      "epoch": 1.5395447078150668,
      "grad_norm": 0.9234392642974854,
      "learning_rate": 8.062723015243822e-05,
      "loss": 2.172,
      "step": 820
    },
    {
      "epoch": 1.558319643276226,
      "grad_norm": 1.4100919961929321,
      "learning_rate": 7.948194445369064e-05,
      "loss": 2.4166,
      "step": 830
    },
    {
      "epoch": 1.5770945787373856,
      "grad_norm": 0.9078715443611145,
      "learning_rate": 7.831243703242637e-05,
      "loss": 2.3765,
      "step": 840
    },
    {
      "epoch": 1.595869514198545,
      "grad_norm": 1.0631110668182373,
      "learning_rate": 7.711966873043398e-05,
      "loss": 2.3603,
      "step": 850
    },
    {
      "epoch": 1.6146444496597043,
      "grad_norm": 0.8904982805252075,
      "learning_rate": 7.590461950013424e-05,
      "loss": 2.2791,
      "step": 860
    },
    {
      "epoch": 1.6334193851208636,
      "grad_norm": 1.1240772008895874,
      "learning_rate": 7.46682875994727e-05,
      "loss": 2.4014,
      "step": 870
    },
    {
      "epoch": 1.6521943205820229,
      "grad_norm": 0.9138346314430237,
      "learning_rate": 7.341168877177267e-05,
      "loss": 2.3897,
      "step": 880
    },
    {
      "epoch": 1.6709692560431824,
      "grad_norm": 0.8400964140892029,
      "learning_rate": 7.213585541122261e-05,
      "loss": 2.3004,
      "step": 890
    },
    {
      "epoch": 1.6897441915043419,
      "grad_norm": 1.1160262823104858,
      "learning_rate": 7.084183571468368e-05,
      "loss": 2.377,
      "step": 900
    },
    {
      "epoch": 1.6897441915043419,
      "eval_loss": 2.2762601375579834,
      "eval_runtime": 115.7128,
      "eval_samples_per_second": 8.184,
      "eval_steps_per_second": 4.096,
      "step": 900
    },
    {
      "epoch": 1.7085191269655011,
      "grad_norm": 0.8896321654319763,
      "learning_rate": 6.953069282051396e-05,
      "loss": 2.1619,
      "step": 910
    },
    {
      "epoch": 1.7272940624266604,
      "grad_norm": 0.848534107208252,
      "learning_rate": 6.820350393511732e-05,
      "loss": 2.447,
      "step": 920
    },
    {
      "epoch": 1.7460689978878197,
      "grad_norm": 0.9254674911499023,
      "learning_rate": 6.686135944793395e-05,
      "loss": 2.2199,
      "step": 930
    },
    {
      "epoch": 1.7648439333489792,
      "grad_norm": 0.8377211093902588,
      "learning_rate": 6.550536203560029e-05,
      "loss": 2.3059,
      "step": 940
    },
    {
      "epoch": 1.7836188688101384,
      "grad_norm": 1.01736319065094,
      "learning_rate": 6.413662575601391e-05,
      "loss": 2.1205,
      "step": 950
    },
    {
      "epoch": 1.802393804271298,
      "grad_norm": 0.7986969947814941,
      "learning_rate": 6.27562751330479e-05,
      "loss": 2.2619,
      "step": 960
    },
    {
      "epoch": 1.8211687397324572,
      "grad_norm": 1.054882287979126,
      "learning_rate": 6.136544423266651e-05,
      "loss": 2.1804,
      "step": 970
    },
    {
      "epoch": 1.8399436751936165,
      "grad_norm": 0.8551494479179382,
      "learning_rate": 5.9965275731201366e-05,
      "loss": 2.3119,
      "step": 980
    },
    {
      "epoch": 1.8587186106547757,
      "grad_norm": 1.810271143913269,
      "learning_rate": 5.8556919976553406e-05,
      "loss": 2.4647,
      "step": 990
    },
    {
      "epoch": 1.8774935461159352,
      "grad_norm": 0.7847923636436462,
      "learning_rate": 5.714153404309228e-05,
      "loss": 2.2977,
      "step": 1000
    },
    {
      "epoch": 1.8774935461159352,
      "eval_loss": 2.262303590774536,
      "eval_runtime": 115.6317,
      "eval_samples_per_second": 8.19,
      "eval_steps_per_second": 4.099,
      "step": 1000
    },
    {
      "epoch": 1.8962684815770947,
      "grad_norm": 0.8150405883789062,
      "learning_rate": 5.5720280781029166e-05,
      "loss": 2.2873,
      "step": 1010
    },
    {
      "epoch": 1.915043417038254,
      "grad_norm": 0.7990459203720093,
      "learning_rate": 5.429432786104446e-05,
      "loss": 2.3096,
      "step": 1020
    },
    {
      "epoch": 1.9338183524994133,
      "grad_norm": 0.9583683013916016,
      "learning_rate": 5.286484681495499e-05,
      "loss": 2.3133,
      "step": 1030
    },
    {
      "epoch": 1.9525932879605725,
      "grad_norm": 0.8717370629310608,
      "learning_rate": 5.1433012073209085e-05,
      "loss": 2.3946,
      "step": 1040
    },
    {
      "epoch": 1.971368223421732,
      "grad_norm": 0.8073247671127319,
      "learning_rate": 5e-05,
      "loss": 2.1941,
      "step": 1050
    },
    {
      "epoch": 1.9901431588828915,
      "grad_norm": 1.0495169162750244,
      "learning_rate": 4.856698792679094e-05,
      "loss": 2.2465,
      "step": 1060
    },
    {
      "epoch": 2.0089180943440508,
      "grad_norm": 0.8290483355522156,
      "learning_rate": 4.7135153185045014e-05,
      "loss": 2.2935,
      "step": 1070
    },
    {
      "epoch": 2.02769302980521,
      "grad_norm": 0.8408583998680115,
      "learning_rate": 4.570567213895555e-05,
      "loss": 2.2642,
      "step": 1080
    },
    {
      "epoch": 2.0464679652663693,
      "grad_norm": 1.1089783906936646,
      "learning_rate": 4.427971921897085e-05,
      "loss": 2.3274,
      "step": 1090
    },
    {
      "epoch": 2.0652429007275286,
      "grad_norm": 1.12042236328125,
      "learning_rate": 4.2858465956907724e-05,
      "loss": 2.269,
      "step": 1100
    },
    {
      "epoch": 2.0652429007275286,
      "eval_loss": 2.252450704574585,
      "eval_runtime": 115.7991,
      "eval_samples_per_second": 8.178,
      "eval_steps_per_second": 4.093,
      "step": 1100
    },
    {
      "epoch": 2.0840178361886883,
      "grad_norm": 0.9814222455024719,
      "learning_rate": 4.14430800234466e-05,
      "loss": 2.1596,
      "step": 1110
    },
    {
      "epoch": 2.1027927716498476,
      "grad_norm": 0.8501948118209839,
      "learning_rate": 4.003472426879866e-05,
      "loss": 2.3033,
      "step": 1120
    },
    {
      "epoch": 2.121567707111007,
      "grad_norm": 0.8352840542793274,
      "learning_rate": 3.863455576733349e-05,
      "loss": 2.2893,
      "step": 1130
    },
    {
      "epoch": 2.140342642572166,
      "grad_norm": 0.8014605045318604,
      "learning_rate": 3.724372486695211e-05,
      "loss": 2.2274,
      "step": 1140
    },
    {
      "epoch": 2.1591175780333254,
      "grad_norm": 0.9311611652374268,
      "learning_rate": 3.5863374243986094e-05,
      "loss": 2.0715,
      "step": 1150
    },
    {
      "epoch": 2.177892513494485,
      "grad_norm": 0.8880860209465027,
      "learning_rate": 3.4494637964399724e-05,
      "loss": 2.2517,
      "step": 1160
    },
    {
      "epoch": 2.1966674489556444,
      "grad_norm": 0.9553248882293701,
      "learning_rate": 3.313864055206607e-05,
      "loss": 2.3358,
      "step": 1170
    },
    {
      "epoch": 2.2154423844168036,
      "grad_norm": 1.015997052192688,
      "learning_rate": 3.179649606488267e-05,
      "loss": 2.2711,
      "step": 1180
    },
    {
      "epoch": 2.234217319877963,
      "grad_norm": 0.8522908091545105,
      "learning_rate": 3.046930717948604e-05,
      "loss": 2.2483,
      "step": 1190
    },
    {
      "epoch": 2.252992255339122,
      "grad_norm": 0.844487190246582,
      "learning_rate": 2.9158164285316353e-05,
      "loss": 2.2305,
      "step": 1200
    },
    {
      "epoch": 2.252992255339122,
      "eval_loss": 2.2441632747650146,
      "eval_runtime": 115.7498,
      "eval_samples_per_second": 8.181,
      "eval_steps_per_second": 4.095,
      "step": 1200
    },
    {
      "epoch": 2.2717671908002814,
      "grad_norm": 0.87571120262146,
      "learning_rate": 2.7864144588777403e-05,
      "loss": 2.2648,
      "step": 1210
    },
    {
      "epoch": 2.290542126261441,
      "grad_norm": 0.8913039565086365,
      "learning_rate": 2.6588311228227347e-05,
      "loss": 2.1812,
      "step": 1220
    },
    {
      "epoch": 2.3093170617226004,
      "grad_norm": 1.0479601621627808,
      "learning_rate": 2.5331712400527298e-05,
      "loss": 2.2898,
      "step": 1230
    },
    {
      "epoch": 2.3280919971837597,
      "grad_norm": 0.9554978013038635,
      "learning_rate": 2.4095380499865762e-05,
      "loss": 2.3384,
      "step": 1240
    },
    {
      "epoch": 2.346866932644919,
      "grad_norm": 0.8619790077209473,
      "learning_rate": 2.288033126956604e-05,
      "loss": 2.2454,
      "step": 1250
    },
    {
      "epoch": 2.3656418681060782,
      "grad_norm": 0.980107843875885,
      "learning_rate": 2.1687562967573645e-05,
      "loss": 2.193,
      "step": 1260
    },
    {
      "epoch": 2.3844168035672375,
      "grad_norm": 0.935689389705658,
      "learning_rate": 2.0518055546309362e-05,
      "loss": 2.3337,
      "step": 1270
    },
    {
      "epoch": 2.4031917390283972,
      "grad_norm": 0.8614078164100647,
      "learning_rate": 1.9372769847561788e-05,
      "loss": 2.1658,
      "step": 1280
    },
    {
      "epoch": 2.4219666744895565,
      "grad_norm": 0.9031429886817932,
      "learning_rate": 1.8252646813080565e-05,
      "loss": 2.1985,
      "step": 1290
    },
    {
      "epoch": 2.4407416099507158,
      "grad_norm": 0.9556667804718018,
      "learning_rate": 1.7158606711519194e-05,
      "loss": 2.3866,
      "step": 1300
    },
    {
      "epoch": 2.4407416099507158,
      "eval_loss": 2.2396435737609863,
      "eval_runtime": 115.863,
      "eval_samples_per_second": 8.173,
      "eval_steps_per_second": 4.091,
      "step": 1300
    },
    {
      "epoch": 2.459516545411875,
      "grad_norm": 0.9352513551712036,
      "learning_rate": 1.6091548382362458e-05,
      "loss": 2.2399,
      "step": 1310
    },
    {
      "epoch": 2.4782914808730343,
      "grad_norm": 1.494532585144043,
      "learning_rate": 1.50523484974594e-05,
      "loss": 2.3273,
      "step": 1320
    },
    {
      "epoch": 2.497066416334194,
      "grad_norm": 0.9380200505256653,
      "learning_rate": 1.40418608407689e-05,
      "loss": 2.2071,
      "step": 1330
    },
    {
      "epoch": 2.5158413517953533,
      "grad_norm": 0.8295271396636963,
      "learning_rate": 1.3060915606909413e-05,
      "loss": 2.274,
      "step": 1340
    },
    {
      "epoch": 2.5346162872565126,
      "grad_norm": 0.8793519735336304,
      "learning_rate": 1.2110318719089158e-05,
      "loss": 2.3791,
      "step": 1350
    },
    {
      "epoch": 2.553391222717672,
      "grad_norm": 0.9659551382064819,
      "learning_rate": 1.1190851166977217e-05,
      "loss": 2.2612,
      "step": 1360
    },
    {
      "epoch": 2.572166158178831,
      "grad_norm": 0.9426782727241516,
      "learning_rate": 1.0303268365059382e-05,
      "loss": 2.2907,
      "step": 1370
    },
    {
      "epoch": 2.590941093639991,
      "grad_norm": 1.0491048097610474,
      "learning_rate": 9.448299532006149e-06,
      "loss": 2.3124,
      "step": 1380
    },
    {
      "epoch": 2.60971602910115,
      "grad_norm": 0.9064353704452515,
      "learning_rate": 8.626647091562612e-06,
      "loss": 2.2214,
      "step": 1390
    },
    {
      "epoch": 2.6284909645623094,
      "grad_norm": 0.7819197773933411,
      "learning_rate": 7.838986095452311e-06,
      "loss": 2.3217,
      "step": 1400
    },
    {
      "epoch": 2.6284909645623094,
      "eval_loss": 2.2368621826171875,
      "eval_runtime": 115.8127,
      "eval_samples_per_second": 8.177,
      "eval_steps_per_second": 4.093,
      "step": 1400
    },
    {
      "epoch": 2.6472659000234686,
      "grad_norm": 0.8173615336418152,
      "learning_rate": 7.085963668769552e-06,
      "loss": 2.2193,
      "step": 1410
    },
    {
      "epoch": 2.666040835484628,
      "grad_norm": 0.7969537377357483,
      "learning_rate": 6.36819847831554e-06,
      "loss": 2.3388,
      "step": 1420
    },
    {
      "epoch": 2.6848157709457876,
      "grad_norm": 0.8578535318374634,
      "learning_rate": 5.686280224315188e-06,
      "loss": 2.3577,
      "step": 1430
    },
    {
      "epoch": 2.703590706406947,
      "grad_norm": 0.8416107296943665,
      "learning_rate": 5.040769155932284e-06,
      "loss": 2.2829,
      "step": 1440
    },
    {
      "epoch": 2.722365641868106,
      "grad_norm": 0.8079660534858704,
      "learning_rate": 4.432195610981032e-06,
      "loss": 2.2675,
      "step": 1450
    },
    {
      "epoch": 2.7411405773292654,
      "grad_norm": 0.8975071907043457,
      "learning_rate": 3.861059580212056e-06,
      "loss": 2.3367,
      "step": 1460
    },
    {
      "epoch": 2.7599155127904247,
      "grad_norm": 1.0292245149612427,
      "learning_rate": 3.3278302965308596e-06,
      "loss": 2.3209,
      "step": 1470
    },
    {
      "epoch": 2.7786904482515844,
      "grad_norm": 0.7877647280693054,
      "learning_rate": 2.8329458494863847e-06,
      "loss": 2.2889,
      "step": 1480
    },
    {
      "epoch": 2.7974653837127432,
      "grad_norm": 0.8482506275177002,
      "learning_rate": 2.3768128253461253e-06,
      "loss": 2.3301,
      "step": 1490
    },
    {
      "epoch": 2.816240319173903,
      "grad_norm": 0.7603471279144287,
      "learning_rate": 1.9598059730537466e-06,
      "loss": 2.2007,
      "step": 1500
    },
    {
      "epoch": 2.816240319173903,
      "eval_loss": 2.2355196475982666,
      "eval_runtime": 115.7546,
      "eval_samples_per_second": 8.181,
      "eval_steps_per_second": 4.095,
      "step": 1500
    },
    {
      "epoch": 2.835015254635062,
      "grad_norm": 0.8267006874084473,
      "learning_rate": 1.5822678963435478e-06,
      "loss": 2.392,
      "step": 1510
    },
    {
      "epoch": 2.8537901900962215,
      "grad_norm": 0.8282234072685242,
      "learning_rate": 1.2445087722646575e-06,
      "loss": 2.2968,
      "step": 1520
    },
    {
      "epoch": 2.8725651255573807,
      "grad_norm": 0.8622906804084778,
      "learning_rate": 9.468060963463755e-07,
      "loss": 2.2902,
      "step": 1530
    },
    {
      "epoch": 2.89134006101854,
      "grad_norm": 0.8282898664474487,
      "learning_rate": 6.894044546138845e-07,
      "loss": 2.2151,
      "step": 1540
    },
    {
      "epoch": 2.9101149964796997,
      "grad_norm": 0.8747497797012329,
      "learning_rate": 4.7251532264170895e-07,
      "loss": 2.188,
      "step": 1550
    },
    {
      "epoch": 2.928889931940859,
      "grad_norm": 0.96438068151474,
      "learning_rate": 2.9631689180999457e-07,
      "loss": 2.4261,
      "step": 1560
    },
    {
      "epoch": 2.9476648674020183,
      "grad_norm": 0.8510390520095825,
      "learning_rate": 1.6095392290635393e-07,
      "loss": 2.272,
      "step": 1570
    },
    {
      "epoch": 2.9664398028631775,
      "grad_norm": 0.7817071080207825,
      "learning_rate": 6.653762719355805e-08,
      "loss": 2.3397,
      "step": 1580
    },
    {
      "epoch": 2.985214738324337,
      "grad_norm": 0.8862946033477783,
      "learning_rate": 1.3145575040801606e-08,
      "loss": 2.269,
      "step": 1590
    },
    {
      "epoch": 2.9964796996010326,
      "step": 1596,
      "total_flos": 1.1780062420402176e+18,
      "train_loss": 2.428264606566656,
      "train_runtime": 11538.2228,
      "train_samples_per_second": 2.216,
      "train_steps_per_second": 0.138
    }
  ],
  "logging_steps": 10,
  "max_steps": 1596,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 1.1780062420402176e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}