|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.984771573604061, |
|
"eval_steps": 500, |
|
"global_step": 294, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.10152284263959391, |
|
"grad_norm": 0.2823072373867035, |
|
"learning_rate": 3.9948625730435075e-05, |
|
"loss": 2.8273, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.20304568527918782, |
|
"grad_norm": 0.2618003189563751, |
|
"learning_rate": 3.969770236712935e-05, |
|
"loss": 2.6708, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.30456852791878175, |
|
"grad_norm": 0.24902400374412537, |
|
"learning_rate": 3.9240711835383766e-05, |
|
"loss": 2.495, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.40609137055837563, |
|
"grad_norm": 0.21406474709510803, |
|
"learning_rate": 3.858297520699212e-05, |
|
"loss": 2.3206, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"grad_norm": 0.24804013967514038, |
|
"learning_rate": 3.7732150985966965e-05, |
|
"loss": 2.1775, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6091370558375635, |
|
"grad_norm": 0.28641432523727417, |
|
"learning_rate": 3.6698145935029794e-05, |
|
"loss": 1.9521, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7106598984771574, |
|
"grad_norm": 0.3936513066291809, |
|
"learning_rate": 3.5492999723996456e-05, |
|
"loss": 1.7139, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.8121827411167513, |
|
"grad_norm": 0.2143162190914154, |
|
"learning_rate": 3.41307447431802e-05, |
|
"loss": 1.4854, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9137055837563451, |
|
"grad_norm": 0.08941923081874847, |
|
"learning_rate": 3.262724271410661e-05, |
|
"loss": 1.3727, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"grad_norm": 0.08629459887742996, |
|
"learning_rate": 3.1e-05, |
|
"loss": 1.3856, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.116751269035533, |
|
"grad_norm": 0.07598450779914856, |
|
"learning_rate": 2.9267963766515236e-05, |
|
"loss": 1.3676, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.218274111675127, |
|
"grad_norm": 0.08433938026428223, |
|
"learning_rate": 2.7451301366163116e-05, |
|
"loss": 1.3437, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.3197969543147208, |
|
"grad_norm": 0.06350907683372498, |
|
"learning_rate": 2.557116551521642e-05, |
|
"loss": 1.3879, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.4213197969543148, |
|
"grad_norm": 0.07121391594409943, |
|
"learning_rate": 2.364944799731179e-05, |
|
"loss": 1.3591, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.5228426395939088, |
|
"grad_norm": 0.0733482763171196, |
|
"learning_rate": 2.1708524761554973e-05, |
|
"loss": 1.3315, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.6243654822335025, |
|
"grad_norm": 0.08874043822288513, |
|
"learning_rate": 1.9770995383136862e-05, |
|
"loss": 1.3264, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.7258883248730963, |
|
"grad_norm": 0.07142503559589386, |
|
"learning_rate": 1.7859419920108888e-05, |
|
"loss": 1.3342, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.8274111675126905, |
|
"grad_norm": 0.07405081391334534, |
|
"learning_rate": 1.5996056230285237e-05, |
|
"loss": 1.325, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.9289340101522843, |
|
"grad_norm": 0.07454005628824234, |
|
"learning_rate": 1.420260080688125e-05, |
|
"loss": 1.3448, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"grad_norm": 0.07932838797569275, |
|
"learning_rate": 1.2499936150526519e-05, |
|
"loss": 1.3212, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.1319796954314723, |
|
"grad_norm": 0.07376272976398468, |
|
"learning_rate": 1.0907887619183308e-05, |
|
"loss": 1.3232, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.233502538071066, |
|
"grad_norm": 0.06934653967618942, |
|
"learning_rate": 9.444992587142751e-06, |
|
"loss": 1.3523, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.33502538071066, |
|
"grad_norm": 0.09688505530357361, |
|
"learning_rate": 8.128284600947997e-06, |
|
"loss": 1.3456, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.436548223350254, |
|
"grad_norm": 0.06206211820244789, |
|
"learning_rate": 6.973095045473124e-06, |
|
"loss": 1.2987, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.5380710659898478, |
|
"grad_norm": 0.08122861385345459, |
|
"learning_rate": 5.992874629503358e-06, |
|
"loss": 1.3153, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.6395939086294415, |
|
"grad_norm": 0.07479345053434372, |
|
"learning_rate": 5.199036769389357e-06, |
|
"loss": 1.2851, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.7411167512690353, |
|
"grad_norm": 0.07279635220766068, |
|
"learning_rate": 4.600824694373e-06, |
|
"loss": 1.322, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.8426395939086295, |
|
"grad_norm": 0.08008701354265213, |
|
"learning_rate": 4.205203820973785e-06, |
|
"loss": 1.2833, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.9441624365482233, |
|
"grad_norm": 0.08546874672174454, |
|
"learning_rate": 4.016780649598707e-06, |
|
"loss": 1.294, |
|
"step": 290 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 294, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.4706848609050624e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|