MohamedAhmedAE committed
Commit fd59336
1 Parent(s): 3bb4cdc

Training in progress, step 95000

adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:16a93671d5b090c8f379d3ef80dfb48390da64b0609eafb13b96590cd1e684ac
+oid sha256:1c8c2bc861c786b2bf9d8ddb8858babedad8fc42c6e26fb00fe13b35096c6de7
 size 5544997664
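
Every binary file in this commit is tracked through Git LFS, so the hunks above and below only swap the pointer's oid (the SHA-256 of the new blob) and, where the payload size changed, its size; the actual weights live in LFS storage. A minimal sketch (Python 3.9+, illustrative file paths that are not part of this repository) of parsing such a pointer and checking a downloaded blob against it:

# Parse a Git LFS pointer file (version / oid / size) and verify a blob.
import hashlib
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    """Read the key/value lines of a Git LFS pointer file."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

def verify_blob(pointer_path: str, blob_path: str) -> bool:
    """Check that a blob matches the oid and size recorded in the pointer."""
    pointer = parse_lfs_pointer(pointer_path)
    expected_oid = pointer["oid"].removeprefix("sha256:")
    data = Path(blob_path).read_bytes()
    return (len(data) == int(pointer["size"])
            and hashlib.sha256(data).hexdigest() == expected_oid)

# Hypothetical usage:
# verify_blob("adapter_model.safetensors", "downloaded_adapter.safetensors")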
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "down_proj",
     "up_proj",
+    "q_proj",
     "k_proj",
     "gate_proj",
-    "down_proj",
-    "o_proj",
     "v_proj",
-    "q_proj"
+    "o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:16a93671d5b090c8f379d3ef80dfb48390da64b0609eafb13b96590cd1e684ac
+oid sha256:588a15f7cea8d0a814b43f40bdcbefaf553e3f1ea3fe8f93ab138197ad9ca78c
 size 5544997664
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5464c27c303415381f5471ad99dd5ce53a77219410f9c1d16f371030c261e24d
+oid sha256:3ebee6fad5e5c1bb9d7e488846379921805f8c0c20003d8a91f2e25aed77a83a
 size 674093138
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:45c375600effc3777511397dfebe48d9d053b6e888254cb1574ec49486dfc9f9
+oid sha256:6d8f8f9743ec8d3f95f6d09874e8c8e1665b1753c549b2fad6b80c9e2a59f8a6
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:626d4207e6537a18f1e89e162d32f5105d10060445c6866d763e711bad257f85
+oid sha256:def38347a8476c414f9a59b485e01231d01480373c9bf5d7882acb65a1218490
 size 1064
last-checkpoint/tokenizer_config.json CHANGED
@@ -2072,6 +2072,7 @@
   "bos_token": "<|im_start|>",
   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": true,
+  "device_map": "auto",
   "eos_token": "<|im_end|>",
   "max_length": 4096,
   "model_input_names": [
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.07057546145494029,
+  "epoch": 0.06611336694777863,
   "eval_steps": 200,
-  "global_step": 94900,
+  "global_step": 88900,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -6230,426 +6230,6 @@
       "learning_rate": 1.9991374234676826e-05,
       "loss": 1.5551,
       "step": 88900
-    },
-    {
-      "epoch": 0.06618773518956465,
-      "grad_norm": 0.6733551621437073,
-      "learning_rate": 1.999135481992186e-05,
-      "loss": 1.4334,
-      "step": 89000
-    },
-    {
-      "epoch": 0.06626210343135068,
-      "grad_norm": 0.8035016059875488,
-      "learning_rate": 1.999133538335166e-05,
-      "loss": 1.4872,
-      "step": 89100
-    },
-    {
-      "epoch": 0.06633647167313671,
-      "grad_norm": 0.4339046776294708,
-      "learning_rate": 1.9991315924966277e-05,
-      "loss": 1.4869,
-      "step": 89200
-    },
-    {
-      "epoch": 0.06641083991492273,
-      "grad_norm": 0.6680594086647034,
-      "learning_rate": 1.9991296444765747e-05,
-      "loss": 1.5103,
-      "step": 89300
-    },
-    {
-      "epoch": 0.06648520815670876,
-      "grad_norm": 0.697487473487854,
-      "learning_rate": 1.9991276942750117e-05,
-      "loss": 1.4239,
-      "step": 89400
-    },
-    {
-      "epoch": 0.06655957639849479,
-      "grad_norm": 0.587734043598175,
-      "learning_rate": 1.9991257418919424e-05,
-      "loss": 1.5856,
-      "step": 89500
-    },
-    {
-      "epoch": 0.06663394464028081,
-      "grad_norm": 0.8574571013450623,
-      "learning_rate": 1.999123787327372e-05,
-      "loss": 1.4818,
-      "step": 89600
-    },
-    {
-      "epoch": 0.06670831288206684,
-      "grad_norm": 1.0861676931381226,
-      "learning_rate": 1.9991218305813035e-05,
-      "loss": 1.4883,
-      "step": 89700
-    },
-    {
-      "epoch": 0.06678268112385287,
-      "grad_norm": 1.0139306783676147,
-      "learning_rate": 1.9991198716537422e-05,
-      "loss": 1.5099,
-      "step": 89800
-    },
-    {
-      "epoch": 0.0668570493656389,
-      "grad_norm": 0.6741511225700378,
-      "learning_rate": 1.999117910544692e-05,
-      "loss": 1.4746,
-      "step": 89900
-    },
-    {
-      "epoch": 0.06693141760742492,
-      "grad_norm": 0.9702801704406738,
-      "learning_rate": 1.999115947254157e-05,
-      "loss": 1.5166,
-      "step": 90000
-    },
-    {
-      "epoch": 0.06700578584921095,
-      "grad_norm": 0.7757803797721863,
-      "learning_rate": 1.9991139817821416e-05,
-      "loss": 1.5031,
-      "step": 90100
-    },
-    {
-      "epoch": 0.06708015409099698,
-      "grad_norm": 0.7200698256492615,
-      "learning_rate": 1.9991120141286502e-05,
-      "loss": 1.5834,
-      "step": 90200
-    },
-    {
-      "epoch": 0.067154522332783,
-      "grad_norm": 0.7415780425071716,
-      "learning_rate": 1.999110044293687e-05,
-      "loss": 1.5689,
-      "step": 90300
-    },
-    {
-      "epoch": 0.06722889057456903,
-      "grad_norm": 0.5777677297592163,
-      "learning_rate": 1.9991080722772564e-05,
-      "loss": 1.5139,
-      "step": 90400
-    },
-    {
-      "epoch": 0.06730325881635506,
-      "grad_norm": 0.6991866827011108,
-      "learning_rate": 1.999106098079363e-05,
-      "loss": 1.5073,
-      "step": 90500
-    },
-    {
-      "epoch": 0.06737762705814108,
-      "grad_norm": 0.6112390160560608,
-      "learning_rate": 1.9991041217000105e-05,
-      "loss": 1.4773,
-      "step": 90600
-    },
-    {
-      "epoch": 0.06745199529992713,
-      "grad_norm": 0.8287676572799683,
-      "learning_rate": 1.9991021431392033e-05,
-      "loss": 1.5425,
-      "step": 90700
-    },
-    {
-      "epoch": 0.06752636354171315,
-      "grad_norm": 0.8582881689071655,
-      "learning_rate": 1.999100162396946e-05,
-      "loss": 1.5581,
-      "step": 90800
-    },
-    {
-      "epoch": 0.06760073178349918,
-      "grad_norm": 0.5585276484489441,
-      "learning_rate": 1.999098179473243e-05,
-      "loss": 1.5015,
-      "step": 90900
-    },
-    {
-      "epoch": 0.0676751000252852,
-      "grad_norm": 0.4237435460090637,
-      "learning_rate": 1.9990961943680984e-05,
-      "loss": 1.523,
-      "step": 91000
-    },
-    {
-      "epoch": 0.06774946826707123,
-      "grad_norm": 0.5455594658851624,
-      "learning_rate": 1.999094207081517e-05,
-      "loss": 1.5448,
-      "step": 91100
-    },
-    {
-      "epoch": 0.06782383650885726,
-      "grad_norm": 0.48855817317962646,
-      "learning_rate": 1.999092217613502e-05,
-      "loss": 1.4535,
-      "step": 91200
-    },
-    {
-      "epoch": 0.06789820475064329,
-      "grad_norm": 0.5199916958808899,
-      "learning_rate": 1.999090225964059e-05,
-      "loss": 1.4921,
-      "step": 91300
-    },
-    {
-      "epoch": 0.06797257299242931,
-      "grad_norm": 0.5790271162986755,
-      "learning_rate": 1.9990882321331916e-05,
-      "loss": 1.5773,
-      "step": 91400
-    },
-    {
-      "epoch": 0.06804694123421534,
-      "grad_norm": 0.5524342656135559,
-      "learning_rate": 1.9990862361209043e-05,
-      "loss": 1.4619,
-      "step": 91500
-    },
-    {
-      "epoch": 0.06812130947600137,
-      "grad_norm": 0.7153291702270508,
-      "learning_rate": 1.999084237927202e-05,
-      "loss": 1.6042,
-      "step": 91600
-    },
-    {
-      "epoch": 0.0681956777177874,
-      "grad_norm": 0.957635223865509,
-      "learning_rate": 1.9990822375520882e-05,
-      "loss": 1.538,
-      "step": 91700
-    },
-    {
-      "epoch": 0.06827004595957342,
-      "grad_norm": 0.38240477442741394,
-      "learning_rate": 1.9990802349955678e-05,
-      "loss": 1.5937,
-      "step": 91800
-    },
-    {
-      "epoch": 0.06834441420135945,
-      "grad_norm": 0.8961233496665955,
-      "learning_rate": 1.999078230257645e-05,
-      "loss": 1.5119,
-      "step": 91900
-    },
-    {
-      "epoch": 0.06841878244314548,
-      "grad_norm": 0.47433900833129883,
-      "learning_rate": 1.999076223338324e-05,
-      "loss": 1.5449,
-      "step": 92000
-    },
-    {
-      "epoch": 0.0684931506849315,
-      "grad_norm": 0.8222399353981018,
-      "learning_rate": 1.9990742142376098e-05,
-      "loss": 1.5334,
-      "step": 92100
-    },
-    {
-      "epoch": 0.06856751892671753,
-      "grad_norm": 0.464373916387558,
-      "learning_rate": 1.999072202955506e-05,
-      "loss": 1.5003,
-      "step": 92200
-    },
-    {
-      "epoch": 0.06864188716850356,
-      "grad_norm": 0.8799763321876526,
-      "learning_rate": 1.9990701894920176e-05,
-      "loss": 1.581,
-      "step": 92300
-    },
-    {
-      "epoch": 0.06871625541028958,
-      "grad_norm": 0.9567086100578308,
-      "learning_rate": 1.999068173847149e-05,
-      "loss": 1.4373,
-      "step": 92400
-    },
-    {
-      "epoch": 0.06879062365207561,
-      "grad_norm": 0.440479576587677,
-      "learning_rate": 1.999066156020904e-05,
-      "loss": 1.5571,
-      "step": 92500
-    },
-    {
-      "epoch": 0.06886499189386165,
-      "grad_norm": 0.7486180663108826,
-      "learning_rate": 1.9990641360132876e-05,
-      "loss": 1.4437,
-      "step": 92600
-    },
-    {
-      "epoch": 0.06893936013564768,
-      "grad_norm": 0.7576742172241211,
-      "learning_rate": 1.9990621138243037e-05,
-      "loss": 1.5306,
-      "step": 92700
-    },
-    {
-      "epoch": 0.0690137283774337,
-      "grad_norm": 0.6755186915397644,
-      "learning_rate": 1.9990600894539574e-05,
-      "loss": 1.5769,
-      "step": 92800
-    },
-    {
-      "epoch": 0.06908809661921973,
-      "grad_norm": 0.6093853712081909,
-      "learning_rate": 1.9990580629022526e-05,
-      "loss": 1.5777,
-      "step": 92900
-    },
-    {
-      "epoch": 0.06916246486100576,
-      "grad_norm": 0.5788242220878601,
-      "learning_rate": 1.9990560341691938e-05,
-      "loss": 1.494,
-      "step": 93000
-    },
-    {
-      "epoch": 0.06923683310279179,
-      "grad_norm": 0.828676700592041,
-      "learning_rate": 1.9990540032547855e-05,
-      "loss": 1.5651,
-      "step": 93100
-    },
-    {
-      "epoch": 0.06931120134457781,
-      "grad_norm": 0.5612863302230835,
-      "learning_rate": 1.9990519701590322e-05,
-      "loss": 1.5584,
-      "step": 93200
-    },
-    {
-      "epoch": 0.06938556958636384,
-      "grad_norm": 0.965107262134552,
-      "learning_rate": 1.999049934881938e-05,
-      "loss": 1.497,
-      "step": 93300
-    },
-    {
-      "epoch": 0.06945993782814987,
-      "grad_norm": 0.46939852833747864,
-      "learning_rate": 1.9990478974235078e-05,
-      "loss": 1.5716,
-      "step": 93400
-    },
-    {
-      "epoch": 0.0695343060699359,
-      "grad_norm": 0.4986964464187622,
-      "learning_rate": 1.999045857783746e-05,
-      "loss": 1.5762,
-      "step": 93500
-    },
-    {
-      "epoch": 0.06960867431172192,
-      "grad_norm": 0.4267128109931946,
-      "learning_rate": 1.9990438159626566e-05,
-      "loss": 1.5101,
-      "step": 93600
-    },
-    {
-      "epoch": 0.06968304255350795,
-      "grad_norm": 0.411811888217926,
-      "learning_rate": 1.9990417719602445e-05,
-      "loss": 1.5623,
-      "step": 93700
-    },
-    {
-      "epoch": 0.06975741079529398,
-      "grad_norm": 0.8761053681373596,
-      "learning_rate": 1.999039725776514e-05,
-      "loss": 1.4294,
-      "step": 93800
-    },
-    {
-      "epoch": 0.06983177903708,
-      "grad_norm": 0.9531000852584839,
-      "learning_rate": 1.99903767741147e-05,
-      "loss": 1.4925,
-      "step": 93900
-    },
-    {
-      "epoch": 0.06990614727886603,
-      "grad_norm": 0.516830325126648,
-      "learning_rate": 1.999035626865116e-05,
-      "loss": 1.5802,
-      "step": 94000
-    },
-    {
-      "epoch": 0.06998051552065206,
-      "grad_norm": 0.47061294317245483,
-      "learning_rate": 1.9990335741374572e-05,
-      "loss": 1.5668,
-      "step": 94100
-    },
-    {
-      "epoch": 0.07005488376243808,
-      "grad_norm": 0.7790777683258057,
-      "learning_rate": 1.9990315192284978e-05,
-      "loss": 1.5568,
-      "step": 94200
-    },
-    {
-      "epoch": 0.07012925200422411,
-      "grad_norm": 0.75156170129776,
-      "learning_rate": 1.9990294621382426e-05,
-      "loss": 1.5217,
-      "step": 94300
-    },
-    {
-      "epoch": 0.07020362024601014,
-      "grad_norm": 1.195028305053711,
-      "learning_rate": 1.999027402866696e-05,
-      "loss": 1.5662,
-      "step": 94400
-    },
-    {
-      "epoch": 0.07027798848779618,
-      "grad_norm": 0.6215851306915283,
-      "learning_rate": 1.999025341413862e-05,
-      "loss": 1.5208,
-      "step": 94500
-    },
-    {
-      "epoch": 0.0703523567295822,
-      "grad_norm": 0.509843647480011,
-      "learning_rate": 1.9990232777797458e-05,
-      "loss": 1.489,
-      "step": 94600
-    },
-    {
-      "epoch": 0.07042672497136823,
-      "grad_norm": 1.2951029539108276,
-      "learning_rate": 1.9990212119643516e-05,
-      "loss": 1.4729,
-      "step": 94700
-    },
-    {
-      "epoch": 0.07050109321315426,
-      "grad_norm": 0.5028135776519775,
-      "learning_rate": 1.9990191439676838e-05,
-      "loss": 1.5579,
-      "step": 94800
-    },
-    {
-      "epoch": 0.07057546145494029,
-      "grad_norm": 0.7202877998352051,
-      "learning_rate": 1.9990170737897473e-05,
-      "loss": 1.5282,
-      "step": 94900
     }
   ],
   "logging_steps": 100,
@@ -6669,7 +6249,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.2933639715629711e+18,
+  "total_flos": 1.211678461812007e+18,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
 
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ccc6594b62fe53f0b1bfeab5cb36a3d9d52c3d027d521d24a54039f0b55f3bd6
-size 5560
+oid sha256:8bb89e5e4b20648cd50836d8df065bde229d29cb5c6085310a18725c84aab824
+size 5496