MohamedAhmedAE commited on
Commit
9896419
1 Parent(s): d1f9513

Training in progress, step 100900, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "up_proj",
24
  "k_proj",
25
  "gate_proj",
26
  "down_proj",
27
- "o_proj",
28
- "v_proj",
29
- "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "v_proj",
24
  "k_proj",
25
  "gate_proj",
26
  "down_proj",
27
+ "q_proj",
28
+ "up_proj",
29
+ "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c8c2bc861c786b2bf9d8ddb8858babedad8fc42c6e26fb00fe13b35096c6de7
3
  size 5544997664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ff50e9a0eef14c00f32c5e550257295427f2d666e009aac32472aef43b0c78f
3
  size 5544997664
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36ebe5440e2bcb412e4131df2efca8e8fc88b5200168c85a419cb901604336b6
3
  size 674093138
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b1f3c8b9d1c8514057a760685418307853b2172387d395e371d81516debcc99
3
  size 674093138
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0610f4aa7ed2f34398fce8dc77c3d7b14d52dfb0bc17dc7f64e8f6c2438e189b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d52e9778ae961a843d4efe5adba669832146332ec663eac9df46d71427724e3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17feec1222485652df46ab05d04d0cb1b6896f1f053ea3ae8ca19c7cd689e6b7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1dd725a3e5295711459643d6e1204a1d04a7f905cc6416544fa87ecdfb18228
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.07064982969672631,
5
  "eval_steps": 200,
6
- "global_step": 95000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6657,6 +6657,419 @@
6657
  "learning_rate": 1.9990150014305462e-05,
6658
  "loss": 1.5194,
6659
  "step": 95000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6660
  }
6661
  ],
6662
  "logging_steps": 100,
@@ -6676,7 +7089,7 @@
6676
  "attributes": {}
6677
  }
6678
  },
6679
- "total_flos": 1.2945898144897352e+18,
6680
  "train_batch_size": 1,
6681
  "trial_name": null,
6682
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.07503755596210195,
5
  "eval_steps": 200,
6
+ "global_step": 100900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6657
  "learning_rate": 1.9990150014305462e-05,
6658
  "loss": 1.5194,
6659
  "step": 95000
6660
+ },
6661
+ {
6662
+ "epoch": 0.07072419793851234,
6663
+ "grad_norm": 0.7444689273834229,
6664
+ "learning_rate": 1.9990129268900848e-05,
6665
+ "loss": 1.5198,
6666
+ "step": 95100
6667
+ },
6668
+ {
6669
+ "epoch": 0.07079856618029837,
6670
+ "grad_norm": 0.9299377202987671,
6671
+ "learning_rate": 1.9990108501683685e-05,
6672
+ "loss": 1.5393,
6673
+ "step": 95200
6674
+ },
6675
+ {
6676
+ "epoch": 0.0708729344220844,
6677
+ "grad_norm": 0.6611402630805969,
6678
+ "learning_rate": 1.999008771265401e-05,
6679
+ "loss": 1.5351,
6680
+ "step": 95300
6681
+ },
6682
+ {
6683
+ "epoch": 0.07094730266387042,
6684
+ "grad_norm": 0.4772530496120453,
6685
+ "learning_rate": 1.9990066901811876e-05,
6686
+ "loss": 1.5243,
6687
+ "step": 95400
6688
+ },
6689
+ {
6690
+ "epoch": 0.07102167090565645,
6691
+ "grad_norm": 0.42998188734054565,
6692
+ "learning_rate": 1.9990046069157322e-05,
6693
+ "loss": 1.5877,
6694
+ "step": 95500
6695
+ },
6696
+ {
6697
+ "epoch": 0.07109603914744247,
6698
+ "grad_norm": 0.7415347099304199,
6699
+ "learning_rate": 1.9990025214690396e-05,
6700
+ "loss": 1.5633,
6701
+ "step": 95600
6702
+ },
6703
+ {
6704
+ "epoch": 0.0711704073892285,
6705
+ "grad_norm": 0.657112717628479,
6706
+ "learning_rate": 1.999000433841114e-05,
6707
+ "loss": 1.4555,
6708
+ "step": 95700
6709
+ },
6710
+ {
6711
+ "epoch": 0.07124477563101453,
6712
+ "grad_norm": 0.9188429713249207,
6713
+ "learning_rate": 1.998998344031961e-05,
6714
+ "loss": 1.4329,
6715
+ "step": 95800
6716
+ },
6717
+ {
6718
+ "epoch": 0.07131914387280056,
6719
+ "grad_norm": 0.8823667168617249,
6720
+ "learning_rate": 1.9989962520415836e-05,
6721
+ "loss": 1.4754,
6722
+ "step": 95900
6723
+ },
6724
+ {
6725
+ "epoch": 0.07139351211458658,
6726
+ "grad_norm": 0.7276200652122498,
6727
+ "learning_rate": 1.9989941578699878e-05,
6728
+ "loss": 1.5286,
6729
+ "step": 96000
6730
+ },
6731
+ {
6732
+ "epoch": 0.07146788035637261,
6733
+ "grad_norm": 0.941512405872345,
6734
+ "learning_rate": 1.998992061517177e-05,
6735
+ "loss": 1.5087,
6736
+ "step": 96100
6737
+ },
6738
+ {
6739
+ "epoch": 0.07154224859815864,
6740
+ "grad_norm": 1.0310442447662354,
6741
+ "learning_rate": 1.998989962983157e-05,
6742
+ "loss": 1.5895,
6743
+ "step": 96200
6744
+ },
6745
+ {
6746
+ "epoch": 0.07161661683994466,
6747
+ "grad_norm": 1.3620883226394653,
6748
+ "learning_rate": 1.9989878622679317e-05,
6749
+ "loss": 1.474,
6750
+ "step": 96300
6751
+ },
6752
+ {
6753
+ "epoch": 0.0716909850817307,
6754
+ "grad_norm": 0.5119801163673401,
6755
+ "learning_rate": 1.998985759371505e-05,
6756
+ "loss": 1.5112,
6757
+ "step": 96400
6758
+ },
6759
+ {
6760
+ "epoch": 0.07176535332351673,
6761
+ "grad_norm": 0.8966123461723328,
6762
+ "learning_rate": 1.998983654293883e-05,
6763
+ "loss": 1.4903,
6764
+ "step": 96500
6765
+ },
6766
+ {
6767
+ "epoch": 0.07183972156530276,
6768
+ "grad_norm": 0.5336944460868835,
6769
+ "learning_rate": 1.998981547035069e-05,
6770
+ "loss": 1.5673,
6771
+ "step": 96600
6772
+ },
6773
+ {
6774
+ "epoch": 0.07191408980708879,
6775
+ "grad_norm": 1.2533961534500122,
6776
+ "learning_rate": 1.9989794375950688e-05,
6777
+ "loss": 1.5039,
6778
+ "step": 96700
6779
+ },
6780
+ {
6781
+ "epoch": 0.07198845804887481,
6782
+ "grad_norm": 1.3317081928253174,
6783
+ "learning_rate": 1.9989773259738858e-05,
6784
+ "loss": 1.567,
6785
+ "step": 96800
6786
+ },
6787
+ {
6788
+ "epoch": 0.07206282629066084,
6789
+ "grad_norm": 0.49700722098350525,
6790
+ "learning_rate": 1.998975212171525e-05,
6791
+ "loss": 1.542,
6792
+ "step": 96900
6793
+ },
6794
+ {
6795
+ "epoch": 0.07213719453244687,
6796
+ "grad_norm": 0.5809246301651001,
6797
+ "learning_rate": 1.9989730961879913e-05,
6798
+ "loss": 1.5097,
6799
+ "step": 97000
6800
+ },
6801
+ {
6802
+ "epoch": 0.07221156277423289,
6803
+ "grad_norm": 0.6107625365257263,
6804
+ "learning_rate": 1.9989709780232894e-05,
6805
+ "loss": 1.536,
6806
+ "step": 97100
6807
+ },
6808
+ {
6809
+ "epoch": 0.07228593101601892,
6810
+ "grad_norm": 0.5271338820457458,
6811
+ "learning_rate": 1.9989688576774234e-05,
6812
+ "loss": 1.5819,
6813
+ "step": 97200
6814
+ },
6815
+ {
6816
+ "epoch": 0.07236029925780495,
6817
+ "grad_norm": 0.6692411303520203,
6818
+ "learning_rate": 1.9989667351503988e-05,
6819
+ "loss": 1.4833,
6820
+ "step": 97300
6821
+ },
6822
+ {
6823
+ "epoch": 0.07243466749959097,
6824
+ "grad_norm": 1.0627728700637817,
6825
+ "learning_rate": 1.998964610442219e-05,
6826
+ "loss": 1.5404,
6827
+ "step": 97400
6828
+ },
6829
+ {
6830
+ "epoch": 0.072509035741377,
6831
+ "grad_norm": 0.5696298480033875,
6832
+ "learning_rate": 1.9989624835528896e-05,
6833
+ "loss": 1.4491,
6834
+ "step": 97500
6835
+ },
6836
+ {
6837
+ "epoch": 0.07258340398316303,
6838
+ "grad_norm": 0.5105301141738892,
6839
+ "learning_rate": 1.998960354482415e-05,
6840
+ "loss": 1.5188,
6841
+ "step": 97600
6842
+ },
6843
+ {
6844
+ "epoch": 0.07265777222494905,
6845
+ "grad_norm": 0.53251713514328,
6846
+ "learning_rate": 1.9989582232307998e-05,
6847
+ "loss": 1.5367,
6848
+ "step": 97700
6849
+ },
6850
+ {
6851
+ "epoch": 0.07273214046673508,
6852
+ "grad_norm": 0.6559078693389893,
6853
+ "learning_rate": 1.9989560897980485e-05,
6854
+ "loss": 1.4773,
6855
+ "step": 97800
6856
+ },
6857
+ {
6858
+ "epoch": 0.07280650870852111,
6859
+ "grad_norm": 0.39833974838256836,
6860
+ "learning_rate": 1.998953954184166e-05,
6861
+ "loss": 1.6063,
6862
+ "step": 97900
6863
+ },
6864
+ {
6865
+ "epoch": 0.07288087695030714,
6866
+ "grad_norm": 1.0479645729064941,
6867
+ "learning_rate": 1.9989518163891566e-05,
6868
+ "loss": 1.565,
6869
+ "step": 98000
6870
+ },
6871
+ {
6872
+ "epoch": 0.07295524519209316,
6873
+ "grad_norm": 0.7905478477478027,
6874
+ "learning_rate": 1.9989496764130253e-05,
6875
+ "loss": 1.5266,
6876
+ "step": 98100
6877
+ },
6878
+ {
6879
+ "epoch": 0.07302961343387919,
6880
+ "grad_norm": 0.4569951295852661,
6881
+ "learning_rate": 1.998947534255777e-05,
6882
+ "loss": 1.5295,
6883
+ "step": 98200
6884
+ },
6885
+ {
6886
+ "epoch": 0.07310398167566523,
6887
+ "grad_norm": 0.5308849215507507,
6888
+ "learning_rate": 1.9989453899174158e-05,
6889
+ "loss": 1.5203,
6890
+ "step": 98300
6891
+ },
6892
+ {
6893
+ "epoch": 0.07317834991745126,
6894
+ "grad_norm": 0.906802773475647,
6895
+ "learning_rate": 1.998943243397947e-05,
6896
+ "loss": 1.556,
6897
+ "step": 98400
6898
+ },
6899
+ {
6900
+ "epoch": 0.07325271815923728,
6901
+ "grad_norm": 0.5071494579315186,
6902
+ "learning_rate": 1.9989410946973747e-05,
6903
+ "loss": 1.5627,
6904
+ "step": 98500
6905
+ },
6906
+ {
6907
+ "epoch": 0.07332708640102331,
6908
+ "grad_norm": 0.5252199172973633,
6909
+ "learning_rate": 1.9989389438157037e-05,
6910
+ "loss": 1.5181,
6911
+ "step": 98600
6912
+ },
6913
+ {
6914
+ "epoch": 0.07340145464280934,
6915
+ "grad_norm": 0.5738980174064636,
6916
+ "learning_rate": 1.9989367907529394e-05,
6917
+ "loss": 1.6101,
6918
+ "step": 98700
6919
+ },
6920
+ {
6921
+ "epoch": 0.07347582288459537,
6922
+ "grad_norm": 0.6898683309555054,
6923
+ "learning_rate": 1.9989346355090853e-05,
6924
+ "loss": 1.579,
6925
+ "step": 98800
6926
+ },
6927
+ {
6928
+ "epoch": 0.07355019112638139,
6929
+ "grad_norm": 0.5396860241889954,
6930
+ "learning_rate": 1.998932478084147e-05,
6931
+ "loss": 1.5645,
6932
+ "step": 98900
6933
+ },
6934
+ {
6935
+ "epoch": 0.07362455936816742,
6936
+ "grad_norm": 0.5482293367385864,
6937
+ "learning_rate": 1.998930318478129e-05,
6938
+ "loss": 1.5453,
6939
+ "step": 99000
6940
+ },
6941
+ {
6942
+ "epoch": 0.07369892760995345,
6943
+ "grad_norm": 0.8394240736961365,
6944
+ "learning_rate": 1.9989281566910363e-05,
6945
+ "loss": 1.5025,
6946
+ "step": 99100
6947
+ },
6948
+ {
6949
+ "epoch": 0.07377329585173947,
6950
+ "grad_norm": 0.9409950971603394,
6951
+ "learning_rate": 1.9989259927228725e-05,
6952
+ "loss": 1.5489,
6953
+ "step": 99200
6954
+ },
6955
+ {
6956
+ "epoch": 0.0738476640935255,
6957
+ "grad_norm": 0.5597321391105652,
6958
+ "learning_rate": 1.9989238265736437e-05,
6959
+ "loss": 1.5994,
6960
+ "step": 99300
6961
+ },
6962
+ {
6963
+ "epoch": 0.07392203233531153,
6964
+ "grad_norm": 0.5139235258102417,
6965
+ "learning_rate": 1.9989216582433538e-05,
6966
+ "loss": 1.5478,
6967
+ "step": 99400
6968
+ },
6969
+ {
6970
+ "epoch": 0.07399640057709755,
6971
+ "grad_norm": 0.6312362551689148,
6972
+ "learning_rate": 1.998919487732008e-05,
6973
+ "loss": 1.4989,
6974
+ "step": 99500
6975
+ },
6976
+ {
6977
+ "epoch": 0.07407076881888358,
6978
+ "grad_norm": 0.6924223303794861,
6979
+ "learning_rate": 1.9989173150396105e-05,
6980
+ "loss": 1.4491,
6981
+ "step": 99600
6982
+ },
6983
+ {
6984
+ "epoch": 0.07414513706066961,
6985
+ "grad_norm": 0.5490585565567017,
6986
+ "learning_rate": 1.9989151401661666e-05,
6987
+ "loss": 1.538,
6988
+ "step": 99700
6989
+ },
6990
+ {
6991
+ "epoch": 0.07421950530245564,
6992
+ "grad_norm": 0.630455732345581,
6993
+ "learning_rate": 1.998912963111681e-05,
6994
+ "loss": 1.5286,
6995
+ "step": 99800
6996
+ },
6997
+ {
6998
+ "epoch": 0.07429387354424166,
6999
+ "grad_norm": 0.8591504693031311,
7000
+ "learning_rate": 1.998910783876158e-05,
7001
+ "loss": 1.5612,
7002
+ "step": 99900
7003
+ },
7004
+ {
7005
+ "epoch": 0.07436824178602769,
7006
+ "grad_norm": 1.0016669034957886,
7007
+ "learning_rate": 1.9989086024596027e-05,
7008
+ "loss": 1.5154,
7009
+ "step": 100000
7010
+ },
7011
+ {
7012
+ "epoch": 0.07444261002781372,
7013
+ "grad_norm": 0.6513885259628296,
7014
+ "learning_rate": 1.9989064188620197e-05,
7015
+ "loss": 1.5446,
7016
+ "step": 100100
7017
+ },
7018
+ {
7019
+ "epoch": 0.07451697826959976,
7020
+ "grad_norm": 0.6838514804840088,
7021
+ "learning_rate": 1.998904233083414e-05,
7022
+ "loss": 1.5336,
7023
+ "step": 100200
7024
+ },
7025
+ {
7026
+ "epoch": 0.07459134651138578,
7027
+ "grad_norm": 0.46571242809295654,
7028
+ "learning_rate": 1.9989020451237903e-05,
7029
+ "loss": 1.4838,
7030
+ "step": 100300
7031
+ },
7032
+ {
7033
+ "epoch": 0.07466571475317181,
7034
+ "grad_norm": 0.9936356544494629,
7035
+ "learning_rate": 1.998899854983153e-05,
7036
+ "loss": 1.5929,
7037
+ "step": 100400
7038
+ },
7039
+ {
7040
+ "epoch": 0.07474008299495784,
7041
+ "grad_norm": 0.6591018438339233,
7042
+ "learning_rate": 1.9988976626615075e-05,
7043
+ "loss": 1.54,
7044
+ "step": 100500
7045
+ },
7046
+ {
7047
+ "epoch": 0.07481445123674386,
7048
+ "grad_norm": 0.8453909754753113,
7049
+ "learning_rate": 1.998895468158858e-05,
7050
+ "loss": 1.5191,
7051
+ "step": 100600
7052
+ },
7053
+ {
7054
+ "epoch": 0.07488881947852989,
7055
+ "grad_norm": 0.6555935144424438,
7056
+ "learning_rate": 1.9988932714752095e-05,
7057
+ "loss": 1.5734,
7058
+ "step": 100700
7059
+ },
7060
+ {
7061
+ "epoch": 0.07496318772031592,
7062
+ "grad_norm": 0.6445733308792114,
7063
+ "learning_rate": 1.998891072610567e-05,
7064
+ "loss": 1.5516,
7065
+ "step": 100800
7066
+ },
7067
+ {
7068
+ "epoch": 0.07503755596210195,
7069
+ "grad_norm": 0.534389078617096,
7070
+ "learning_rate": 1.9988888715649357e-05,
7071
+ "loss": 1.5441,
7072
+ "step": 100900
7073
  }
7074
  ],
7075
  "logging_steps": 100,
 
7089
  "attributes": {}
7090
  }
7091
  },
7092
+ "total_flos": 1.3747108667853128e+18,
7093
  "train_batch_size": 1,
7094
  "trial_name": null,
7095
  "trial_params": null