Alex-xu committed on
Commit
4d84a6e
1 Parent(s): c7f6144

Training in progress, step 6500, checkpoint

Browse files
checkpoint-6500/config.json CHANGED
@@ -16,7 +16,7 @@
16
  "intermediate_size": 3072,
17
  "layer_norm_eps": 1e-12,
18
  "max_blocks": 200,
19
- "max_position_embeddings": 1801,
20
  "max_relative_position_embeddings": 8,
21
  "model_type": "longelm",
22
  "node_size": 1,
 
16
  "intermediate_size": 3072,
17
  "layer_norm_eps": 1e-12,
18
  "max_blocks": 200,
19
+ "max_position_embeddings": 1805,
20
  "max_relative_position_embeddings": 8,
21
  "model_type": "longelm",
22
  "node_size": 1,
checkpoint-6500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cafd9385532b5445162239e58b8a5e0127213f1e220c822154583bba81c1587c
3
- size 1010407418
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78a1bcf829ca59fd5516a454b6a5da8e3254f3c1499d6c3cd3f5471c7183307c
3
+ size 1010431994
checkpoint-6500/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc3c57fdc86b39e9ee3e40c071c9e9d037ef2d31d38dd2883d61f28f6428a8f2
3
- size 505188394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b890105b5f6ecc2adfd62a87ff6f06cb65a5ed44a32eb0a059a76a94a9c8f1f7
3
+ size 505200682
checkpoint-6500/trainer_state.json CHANGED
@@ -9,107 +9,107 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "ep_loss": 4.7343,
13
  "epoch": 0.04,
14
  "learning_rate": 2.5e-05,
15
- "loss": 11.6218,
16
- "mlm_loss": 6.8875,
17
  "step": 500
18
  },
19
  {
20
- "ep_loss": 0.4315,
21
  "epoch": 0.08,
22
  "learning_rate": 5e-05,
23
- "loss": 3.0285,
24
- "mlm_loss": 2.5969,
25
  "step": 1000
26
  },
27
  {
28
- "ep_loss": 0.3811,
29
  "epoch": 0.12,
30
  "learning_rate": 7.5e-05,
31
- "loss": 2.2571,
32
- "mlm_loss": 1.8761,
33
  "step": 1500
34
  },
35
  {
36
- "ep_loss": 0.37,
37
  "epoch": 0.15,
38
  "learning_rate": 0.0001,
39
- "loss": 1.9695,
40
- "mlm_loss": 1.5994,
41
  "step": 2000
42
  },
43
  {
44
- "ep_loss": 0.3813,
45
  "epoch": 0.19,
46
  "learning_rate": 0.00012495,
47
- "loss": 1.5597,
48
- "mlm_loss": 1.1785,
49
  "step": 2500
50
  },
51
  {
52
- "ep_loss": 0.3624,
53
  "epoch": 0.23,
54
  "learning_rate": 0.00014995,
55
- "loss": 1.4228,
56
- "mlm_loss": 1.0604,
57
  "step": 3000
58
  },
59
  {
60
- "ep_loss": 0.338,
61
  "epoch": 0.27,
62
- "learning_rate": 0.00017495,
63
- "loss": 1.3108,
64
- "mlm_loss": 0.9729,
65
  "step": 3500
66
  },
67
  {
68
- "ep_loss": 0.3524,
69
  "epoch": 0.31,
70
- "learning_rate": 0.00019994999999999998,
71
- "loss": 1.2734,
72
- "mlm_loss": 0.921,
73
  "step": 4000
74
  },
75
  {
76
- "ep_loss": 0.3423,
77
  "epoch": 0.35,
78
- "learning_rate": 0.00022495000000000002,
79
- "loss": 1.217,
80
- "mlm_loss": 0.8747,
81
  "step": 4500
82
  },
83
  {
84
- "ep_loss": 0.3229,
85
  "epoch": 0.38,
86
  "learning_rate": 0.0002499,
87
- "loss": 1.1373,
88
- "mlm_loss": 0.8144,
89
  "step": 5000
90
  },
91
  {
92
- "ep_loss": 0.3326,
93
  "epoch": 0.42,
94
  "learning_rate": 0.00027489999999999996,
95
- "loss": 1.117,
96
- "mlm_loss": 0.7844,
97
  "step": 5500
98
  },
99
  {
100
- "ep_loss": 0.332,
101
  "epoch": 0.46,
102
  "learning_rate": 0.00029985,
103
- "loss": 1.0958,
104
- "mlm_loss": 0.7638,
105
  "step": 6000
106
  },
107
  {
108
- "ep_loss": 0.3236,
109
  "epoch": 0.5,
110
  "learning_rate": 0.00032485,
111
- "loss": 1.0729,
112
- "mlm_loss": 0.7493,
113
  "step": 6500
114
  }
115
  ],
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "ep_loss": 4.4238,
13
  "epoch": 0.04,
14
  "learning_rate": 2.5e-05,
15
+ "loss": 11.1467,
16
+ "mlm_loss": 6.7229,
17
  "step": 500
18
  },
19
  {
20
+ "ep_loss": 0.4305,
21
  "epoch": 0.08,
22
  "learning_rate": 5e-05,
23
+ "loss": 2.9841,
24
+ "mlm_loss": 2.5537,
25
  "step": 1000
26
  },
27
  {
28
+ "ep_loss": 0.3724,
29
  "epoch": 0.12,
30
  "learning_rate": 7.5e-05,
31
+ "loss": 2.2371,
32
+ "mlm_loss": 1.8647,
33
  "step": 1500
34
  },
35
  {
36
+ "ep_loss": 0.364,
37
  "epoch": 0.15,
38
  "learning_rate": 0.0001,
39
+ "loss": 1.8477,
40
+ "mlm_loss": 1.4837,
41
  "step": 2000
42
  },
43
  {
44
+ "ep_loss": 0.3678,
45
  "epoch": 0.19,
46
  "learning_rate": 0.00012495,
47
+ "loss": 1.5215,
48
+ "mlm_loss": 1.1538,
49
  "step": 2500
50
  },
51
  {
52
+ "ep_loss": 0.3617,
53
  "epoch": 0.23,
54
  "learning_rate": 0.00014995,
55
+ "loss": 1.4119,
56
+ "mlm_loss": 1.0501,
57
  "step": 3000
58
  },
59
  {
60
+ "ep_loss": 0.3336,
61
  "epoch": 0.27,
62
+ "learning_rate": 0.0001749,
63
+ "loss": 1.3027,
64
+ "mlm_loss": 0.9691,
65
  "step": 3500
66
  },
67
  {
68
+ "ep_loss": 0.3348,
69
  "epoch": 0.31,
70
+ "learning_rate": 0.0001999,
71
+ "loss": 1.2441,
72
+ "mlm_loss": 0.9093,
73
  "step": 4000
74
  },
75
  {
76
+ "ep_loss": 0.3348,
77
  "epoch": 0.35,
78
+ "learning_rate": 0.0002249,
79
+ "loss": 1.1942,
80
+ "mlm_loss": 0.8594,
81
  "step": 4500
82
  },
83
  {
84
+ "ep_loss": 0.3331,
85
  "epoch": 0.38,
86
  "learning_rate": 0.0002499,
87
+ "loss": 1.1466,
88
+ "mlm_loss": 0.8135,
89
  "step": 5000
90
  },
91
  {
92
+ "ep_loss": 0.3268,
93
  "epoch": 0.42,
94
  "learning_rate": 0.00027489999999999996,
95
+ "loss": 1.1067,
96
+ "mlm_loss": 0.7799,
97
  "step": 5500
98
  },
99
  {
100
+ "ep_loss": 0.3378,
101
  "epoch": 0.46,
102
  "learning_rate": 0.00029985,
103
+ "loss": 1.1007,
104
+ "mlm_loss": 0.7629,
105
  "step": 6000
106
  },
107
  {
108
+ "ep_loss": 0.3033,
109
  "epoch": 0.5,
110
  "learning_rate": 0.00032485,
111
+ "loss": 1.028,
112
+ "mlm_loss": 0.7246,
113
  "step": 6500
114
  }
115
  ],
checkpoint-6500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cee56d4f38339eac5ecf098381d5222b3783455863547a17ac0df67c8caa7d15
3
  size 4664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b26e03be70ebe5d40a81a81be94421c0578a9d8742b065bc029501df48632957
3
  size 4664