|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 7815, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06397952655150352, |
|
"grad_norm": 1.344713568687439, |
|
"learning_rate": 0.00029616122840690973, |
|
"loss": 2.6677, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12795905310300704, |
|
"grad_norm": 1.3358043432235718, |
|
"learning_rate": 0.00029232245681381954, |
|
"loss": 2.2932, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.19193857965451055, |
|
"grad_norm": 1.3450862169265747, |
|
"learning_rate": 0.0002885220729366602, |
|
"loss": 2.2158, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2559181062060141, |
|
"grad_norm": 1.1904783248901367, |
|
"learning_rate": 0.00028468330134357004, |
|
"loss": 2.1907, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3198976327575176, |
|
"grad_norm": 1.3194555044174194, |
|
"learning_rate": 0.00028084452975047985, |
|
"loss": 2.1725, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3838771593090211, |
|
"grad_norm": 1.3454678058624268, |
|
"learning_rate": 0.0002770057581573896, |
|
"loss": 2.1586, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.44785668586052463, |
|
"grad_norm": 1.318642497062683, |
|
"learning_rate": 0.0002731669865642994, |
|
"loss": 2.1429, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5118362124120281, |
|
"grad_norm": 1.1936390399932861, |
|
"learning_rate": 0.0002693282149712092, |
|
"loss": 2.0988, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5758157389635317, |
|
"grad_norm": 1.1406426429748535, |
|
"learning_rate": 0.000265489443378119, |
|
"loss": 2.0834, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6397952655150352, |
|
"grad_norm": 1.2195415496826172, |
|
"learning_rate": 0.0002616506717850288, |
|
"loss": 2.1004, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7037747920665387, |
|
"grad_norm": 1.2042839527130127, |
|
"learning_rate": 0.00025781190019193856, |
|
"loss": 2.0883, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7677543186180422, |
|
"grad_norm": 1.159279465675354, |
|
"learning_rate": 0.0002539731285988483, |
|
"loss": 2.0852, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8317338451695457, |
|
"grad_norm": 1.2210711240768433, |
|
"learning_rate": 0.00025013435700575813, |
|
"loss": 2.0449, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8957133717210493, |
|
"grad_norm": 1.2941797971725464, |
|
"learning_rate": 0.00024629558541266794, |
|
"loss": 2.0455, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9596928982725528, |
|
"grad_norm": 1.1550540924072266, |
|
"learning_rate": 0.00024245681381957772, |
|
"loss": 2.0293, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0236724248240563, |
|
"grad_norm": 1.3520652055740356, |
|
"learning_rate": 0.0002386564299424184, |
|
"loss": 1.9799, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.0876519513755598, |
|
"grad_norm": 1.375148892402649, |
|
"learning_rate": 0.0002348176583493282, |
|
"loss": 1.9032, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.1516314779270633, |
|
"grad_norm": 1.4116652011871338, |
|
"learning_rate": 0.00023097888675623797, |
|
"loss": 1.9398, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.2156110044785668, |
|
"grad_norm": 1.2254273891448975, |
|
"learning_rate": 0.00022714011516314776, |
|
"loss": 1.9097, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.2795905310300704, |
|
"grad_norm": 1.3888587951660156, |
|
"learning_rate": 0.00022330134357005757, |
|
"loss": 1.9039, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.3435700575815739, |
|
"grad_norm": 1.4431171417236328, |
|
"learning_rate": 0.00021946257197696736, |
|
"loss": 1.9095, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.4075495841330774, |
|
"grad_norm": 1.2467221021652222, |
|
"learning_rate": 0.00021562380038387714, |
|
"loss": 1.9103, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.471529110684581, |
|
"grad_norm": 1.41363525390625, |
|
"learning_rate": 0.00021178502879078693, |
|
"loss": 1.9132, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.5355086372360844, |
|
"grad_norm": 1.4501458406448364, |
|
"learning_rate": 0.0002079462571976967, |
|
"loss": 1.9095, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.599488163787588, |
|
"grad_norm": 1.2889657020568848, |
|
"learning_rate": 0.00020410748560460652, |
|
"loss": 1.9308, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.6634676903390915, |
|
"grad_norm": 1.4488581418991089, |
|
"learning_rate": 0.0002002687140115163, |
|
"loss": 1.9161, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.727447216890595, |
|
"grad_norm": 1.3905428647994995, |
|
"learning_rate": 0.0001964299424184261, |
|
"loss": 1.8958, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.7914267434420985, |
|
"grad_norm": 1.3509632349014282, |
|
"learning_rate": 0.00019259117082533588, |
|
"loss": 1.8933, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.855406269993602, |
|
"grad_norm": 1.3293097019195557, |
|
"learning_rate": 0.00018875239923224566, |
|
"loss": 1.8902, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.9193857965451055, |
|
"grad_norm": 1.3575371503829956, |
|
"learning_rate": 0.00018491362763915547, |
|
"loss": 1.9107, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.983365323096609, |
|
"grad_norm": 1.4029614925384521, |
|
"learning_rate": 0.00018107485604606526, |
|
"loss": 1.9122, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.0473448496481126, |
|
"grad_norm": 1.6483345031738281, |
|
"learning_rate": 0.00017723608445297504, |
|
"loss": 1.7797, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.111324376199616, |
|
"grad_norm": 1.7276026010513306, |
|
"learning_rate": 0.00017339731285988483, |
|
"loss": 1.7411, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.1753039027511196, |
|
"grad_norm": 1.6449826955795288, |
|
"learning_rate": 0.00016955854126679461, |
|
"loss": 1.7355, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.239283429302623, |
|
"grad_norm": 1.606766939163208, |
|
"learning_rate": 0.00016571976967370443, |
|
"loss": 1.7479, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.3032629558541267, |
|
"grad_norm": 1.6743805408477783, |
|
"learning_rate": 0.0001618809980806142, |
|
"loss": 1.7376, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.36724248240563, |
|
"grad_norm": 1.58048415184021, |
|
"learning_rate": 0.00015804222648752397, |
|
"loss": 1.7499, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.4312220089571337, |
|
"grad_norm": 1.7509996891021729, |
|
"learning_rate": 0.00015420345489443375, |
|
"loss": 1.741, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.495201535508637, |
|
"grad_norm": 1.6279881000518799, |
|
"learning_rate": 0.00015036468330134354, |
|
"loss": 1.7584, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.5591810620601407, |
|
"grad_norm": 1.6708228588104248, |
|
"learning_rate": 0.00014652591170825335, |
|
"loss": 1.7505, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.6231605886116443, |
|
"grad_norm": 1.628318428993225, |
|
"learning_rate": 0.00014268714011516314, |
|
"loss": 1.7535, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.6871401151631478, |
|
"grad_norm": 1.66116464138031, |
|
"learning_rate": 0.00013884836852207292, |
|
"loss": 1.7534, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.7511196417146513, |
|
"grad_norm": 1.7303767204284668, |
|
"learning_rate": 0.0001350095969289827, |
|
"loss": 1.7605, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.815099168266155, |
|
"grad_norm": 1.6892797946929932, |
|
"learning_rate": 0.00013117082533589252, |
|
"loss": 1.7343, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.8790786948176583, |
|
"grad_norm": 1.700649380683899, |
|
"learning_rate": 0.0001273320537428023, |
|
"loss": 1.7545, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.943058221369162, |
|
"grad_norm": 1.7158896923065186, |
|
"learning_rate": 0.0001234932821497121, |
|
"loss": 1.7472, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.0070377479206654, |
|
"grad_norm": 1.5952404737472534, |
|
"learning_rate": 0.00011965451055662187, |
|
"loss": 1.709, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.071017274472169, |
|
"grad_norm": 1.8965271711349487, |
|
"learning_rate": 0.00011581573896353166, |
|
"loss": 1.5308, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.1349968010236724, |
|
"grad_norm": 1.9957573413848877, |
|
"learning_rate": 0.00011197696737044146, |
|
"loss": 1.564, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.198976327575176, |
|
"grad_norm": 2.0544333457946777, |
|
"learning_rate": 0.00010813819577735124, |
|
"loss": 1.5668, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.2629558541266794, |
|
"grad_norm": 2.041703462600708, |
|
"learning_rate": 0.00010429942418426103, |
|
"loss": 1.5766, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 3.326935380678183, |
|
"grad_norm": 2.300631284713745, |
|
"learning_rate": 0.00010046065259117082, |
|
"loss": 1.5701, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 3.3909149072296865, |
|
"grad_norm": 1.9454134702682495, |
|
"learning_rate": 9.662188099808061e-05, |
|
"loss": 1.5701, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 3.45489443378119, |
|
"grad_norm": 2.113377571105957, |
|
"learning_rate": 9.278310940499041e-05, |
|
"loss": 1.5882, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.5188739603326935, |
|
"grad_norm": 2.2492353916168213, |
|
"learning_rate": 8.894433781190018e-05, |
|
"loss": 1.5778, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.582853486884197, |
|
"grad_norm": 2.1024489402770996, |
|
"learning_rate": 8.510556621880996e-05, |
|
"loss": 1.5926, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 3.6468330134357005, |
|
"grad_norm": 2.1116743087768555, |
|
"learning_rate": 8.126679462571976e-05, |
|
"loss": 1.5937, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 3.710812539987204, |
|
"grad_norm": 2.013080596923828, |
|
"learning_rate": 7.742802303262955e-05, |
|
"loss": 1.5913, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 3.7747920665387076, |
|
"grad_norm": 2.1557400226593018, |
|
"learning_rate": 7.358925143953934e-05, |
|
"loss": 1.6041, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 3.838771593090211, |
|
"grad_norm": 2.10186767578125, |
|
"learning_rate": 6.975047984644913e-05, |
|
"loss": 1.5799, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.9027511196417146, |
|
"grad_norm": 2.129519462585449, |
|
"learning_rate": 6.591170825335893e-05, |
|
"loss": 1.5946, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 3.966730646193218, |
|
"grad_norm": 2.045646905899048, |
|
"learning_rate": 6.20729366602687e-05, |
|
"loss": 1.5882, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 4.030710172744722, |
|
"grad_norm": 2.2427146434783936, |
|
"learning_rate": 5.82341650671785e-05, |
|
"loss": 1.5005, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 4.094689699296225, |
|
"grad_norm": 2.2632296085357666, |
|
"learning_rate": 5.439539347408829e-05, |
|
"loss": 1.3888, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 4.158669225847729, |
|
"grad_norm": 2.541220188140869, |
|
"learning_rate": 5.0556621880998075e-05, |
|
"loss": 1.4124, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 4.222648752399232, |
|
"grad_norm": 2.566311836242676, |
|
"learning_rate": 4.6717850287907866e-05, |
|
"loss": 1.409, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 4.286628278950736, |
|
"grad_norm": 2.425945281982422, |
|
"learning_rate": 4.287907869481765e-05, |
|
"loss": 1.4134, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 4.350607805502239, |
|
"grad_norm": 2.4377615451812744, |
|
"learning_rate": 3.904030710172744e-05, |
|
"loss": 1.4257, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 4.414587332053743, |
|
"grad_norm": 2.6660194396972656, |
|
"learning_rate": 3.5201535508637234e-05, |
|
"loss": 1.4288, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 4.478566858605246, |
|
"grad_norm": 2.393036365509033, |
|
"learning_rate": 3.1362763915547026e-05, |
|
"loss": 1.4182, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.54254638515675, |
|
"grad_norm": 2.6361422538757324, |
|
"learning_rate": 2.752399232245681e-05, |
|
"loss": 1.4149, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 4.606525911708253, |
|
"grad_norm": 2.6104772090911865, |
|
"learning_rate": 2.3685220729366603e-05, |
|
"loss": 1.407, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 4.670505438259757, |
|
"grad_norm": 2.4266579151153564, |
|
"learning_rate": 1.9846449136276387e-05, |
|
"loss": 1.4113, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 4.73448496481126, |
|
"grad_norm": 2.5348973274230957, |
|
"learning_rate": 1.600767754318618e-05, |
|
"loss": 1.3936, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 4.798464491362764, |
|
"grad_norm": 2.3764045238494873, |
|
"learning_rate": 1.2168905950095967e-05, |
|
"loss": 1.4097, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 4.862444017914267, |
|
"grad_norm": 2.4267590045928955, |
|
"learning_rate": 8.330134357005757e-06, |
|
"loss": 1.4209, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 4.926423544465771, |
|
"grad_norm": 2.7150962352752686, |
|
"learning_rate": 4.4913627639155465e-06, |
|
"loss": 1.4148, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 4.990403071017274, |
|
"grad_norm": 2.550471067428589, |
|
"learning_rate": 6.525911708253358e-07, |
|
"loss": 1.4254, |
|
"step": 7800 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 7815, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 2500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.2180160877992346e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|