Jorkieboe committed on
Commit
ab8adb8
1 Parent(s): add959d

Upload 11 files

Browse files
adapter_config.json CHANGED
@@ -9,8 +9,8 @@
9
  "layers_pattern": null,
10
  "layers_to_transform": null,
11
  "loftq_config": {},
12
- "lora_alpha": 24,
13
- "lora_dropout": 0.01,
14
  "megatron_config": null,
15
  "megatron_core": "megatron.core",
16
  "modules_to_save": null,
 
9
  "layers_pattern": null,
10
  "layers_to_transform": null,
11
  "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
  "megatron_config": null,
15
  "megatron_core": "megatron.core",
16
  "modules_to_save": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d9b9026f1fc8d65193982e5cb561167c3246717663d8f0465fa4bb4d1521817
3
  size 1789320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:108e1bf4fbf2d7f1adecf290bb24085b9c64793f96e5ca60392409de1a6c1b4d
3
  size 1789320
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f05520ccce78a7ddaba8a53c25dac98a9f0d6ec7f39a70f95046f1f67108f32
3
+ size 3624250
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f4ed6b80aa5ab63802a3c246d8fc2e4caaa87e994e156271c635c512697c7a1
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19781da243137bd65d859c8bf7788ab89687fb0c90fb6adca6a1e0ca4b00f4b3
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,2939 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 218.1818181818182,
5
+ "eval_steps": 500,
6
+ "global_step": 6000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.98,
13
+ "eval_gen_len": 14.9909,
14
+ "eval_loss": 21.427785873413086,
15
+ "eval_rouge1": 0.0931,
16
+ "eval_rouge2": 0.021,
17
+ "eval_rougeL": 0.0719,
18
+ "eval_rougeLsum": 0.0715,
19
+ "eval_runtime": 11.4039,
20
+ "eval_samples_per_second": 9.646,
21
+ "eval_steps_per_second": 1.228,
22
+ "step": 27
23
+ },
24
+ {
25
+ "epoch": 2.0,
26
+ "eval_gen_len": 14.9909,
27
+ "eval_loss": 21.134571075439453,
28
+ "eval_rouge1": 0.0948,
29
+ "eval_rouge2": 0.0238,
30
+ "eval_rougeL": 0.073,
31
+ "eval_rougeLsum": 0.0724,
32
+ "eval_runtime": 9.5984,
33
+ "eval_samples_per_second": 11.46,
34
+ "eval_steps_per_second": 1.459,
35
+ "step": 55
36
+ },
37
+ {
38
+ "epoch": 2.98,
39
+ "eval_gen_len": 14.9909,
40
+ "eval_loss": 20.702985763549805,
41
+ "eval_rouge1": 0.0917,
42
+ "eval_rouge2": 0.0203,
43
+ "eval_rougeL": 0.0707,
44
+ "eval_rougeLsum": 0.0703,
45
+ "eval_runtime": 9.5997,
46
+ "eval_samples_per_second": 11.459,
47
+ "eval_steps_per_second": 1.458,
48
+ "step": 82
49
+ },
50
+ {
51
+ "epoch": 4.0,
52
+ "eval_gen_len": 15.3455,
53
+ "eval_loss": 20.22304344177246,
54
+ "eval_rouge1": 0.0991,
55
+ "eval_rouge2": 0.0246,
56
+ "eval_rougeL": 0.0781,
57
+ "eval_rougeLsum": 0.0778,
58
+ "eval_runtime": 9.5876,
59
+ "eval_samples_per_second": 11.473,
60
+ "eval_steps_per_second": 1.46,
61
+ "step": 110
62
+ },
63
+ {
64
+ "epoch": 4.98,
65
+ "eval_gen_len": 15.4818,
66
+ "eval_loss": 19.888574600219727,
67
+ "eval_rouge1": 0.1057,
68
+ "eval_rouge2": 0.0297,
69
+ "eval_rougeL": 0.0845,
70
+ "eval_rougeLsum": 0.0845,
71
+ "eval_runtime": 9.549,
72
+ "eval_samples_per_second": 11.519,
73
+ "eval_steps_per_second": 1.466,
74
+ "step": 137
75
+ },
76
+ {
77
+ "epoch": 6.0,
78
+ "eval_gen_len": 15.7182,
79
+ "eval_loss": 19.539011001586914,
80
+ "eval_rouge1": 0.1034,
81
+ "eval_rouge2": 0.0276,
82
+ "eval_rougeL": 0.084,
83
+ "eval_rougeLsum": 0.0839,
84
+ "eval_runtime": 9.551,
85
+ "eval_samples_per_second": 11.517,
86
+ "eval_steps_per_second": 1.466,
87
+ "step": 165
88
+ },
89
+ {
90
+ "epoch": 6.98,
91
+ "eval_gen_len": 15.8727,
92
+ "eval_loss": 19.162174224853516,
93
+ "eval_rouge1": 0.1023,
94
+ "eval_rouge2": 0.0303,
95
+ "eval_rougeL": 0.0837,
96
+ "eval_rougeLsum": 0.0835,
97
+ "eval_runtime": 9.5511,
98
+ "eval_samples_per_second": 11.517,
99
+ "eval_steps_per_second": 1.466,
100
+ "step": 192
101
+ },
102
+ {
103
+ "epoch": 8.0,
104
+ "eval_gen_len": 15.5091,
105
+ "eval_loss": 18.730688095092773,
106
+ "eval_rouge1": 0.1034,
107
+ "eval_rouge2": 0.0342,
108
+ "eval_rougeL": 0.0832,
109
+ "eval_rougeLsum": 0.083,
110
+ "eval_runtime": 9.5901,
111
+ "eval_samples_per_second": 11.47,
112
+ "eval_steps_per_second": 1.46,
113
+ "step": 220
114
+ },
115
+ {
116
+ "epoch": 8.98,
117
+ "eval_gen_len": 15.8,
118
+ "eval_loss": 18.190088272094727,
119
+ "eval_rouge1": 0.0969,
120
+ "eval_rouge2": 0.0344,
121
+ "eval_rougeL": 0.0818,
122
+ "eval_rougeLsum": 0.0815,
123
+ "eval_runtime": 9.5799,
124
+ "eval_samples_per_second": 11.482,
125
+ "eval_steps_per_second": 1.461,
126
+ "step": 247
127
+ },
128
+ {
129
+ "epoch": 10.0,
130
+ "eval_gen_len": 15.9455,
131
+ "eval_loss": 17.473072052001953,
132
+ "eval_rouge1": 0.1041,
133
+ "eval_rouge2": 0.0337,
134
+ "eval_rougeL": 0.0857,
135
+ "eval_rougeLsum": 0.0853,
136
+ "eval_runtime": 9.5524,
137
+ "eval_samples_per_second": 11.515,
138
+ "eval_steps_per_second": 1.466,
139
+ "step": 275
140
+ },
141
+ {
142
+ "epoch": 10.98,
143
+ "eval_gen_len": 18.0182,
144
+ "eval_loss": 16.60153579711914,
145
+ "eval_rouge1": 0.1001,
146
+ "eval_rouge2": 0.029,
147
+ "eval_rougeL": 0.0828,
148
+ "eval_rougeLsum": 0.0828,
149
+ "eval_runtime": 9.5623,
150
+ "eval_samples_per_second": 11.503,
151
+ "eval_steps_per_second": 1.464,
152
+ "step": 302
153
+ },
154
+ {
155
+ "epoch": 12.0,
156
+ "eval_gen_len": 16.9636,
157
+ "eval_loss": 15.59052848815918,
158
+ "eval_rouge1": 0.0875,
159
+ "eval_rouge2": 0.0222,
160
+ "eval_rougeL": 0.0724,
161
+ "eval_rougeLsum": 0.0724,
162
+ "eval_runtime": 9.5436,
163
+ "eval_samples_per_second": 11.526,
164
+ "eval_steps_per_second": 1.467,
165
+ "step": 330
166
+ },
167
+ {
168
+ "epoch": 12.98,
169
+ "eval_gen_len": 15.1455,
170
+ "eval_loss": 14.362702369689941,
171
+ "eval_rouge1": 0.0657,
172
+ "eval_rouge2": 0.0153,
173
+ "eval_rougeL": 0.0545,
174
+ "eval_rougeLsum": 0.0543,
175
+ "eval_runtime": 9.5681,
176
+ "eval_samples_per_second": 11.497,
177
+ "eval_steps_per_second": 1.463,
178
+ "step": 357
179
+ },
180
+ {
181
+ "epoch": 14.0,
182
+ "eval_gen_len": 13.2273,
183
+ "eval_loss": 12.995957374572754,
184
+ "eval_rouge1": 0.0449,
185
+ "eval_rouge2": 0.0083,
186
+ "eval_rougeL": 0.0363,
187
+ "eval_rougeLsum": 0.0361,
188
+ "eval_runtime": 9.506,
189
+ "eval_samples_per_second": 11.572,
190
+ "eval_steps_per_second": 1.473,
191
+ "step": 385
192
+ },
193
+ {
194
+ "epoch": 14.98,
195
+ "eval_gen_len": 13.1818,
196
+ "eval_loss": 11.857050895690918,
197
+ "eval_rouge1": 0.0337,
198
+ "eval_rouge2": 0.0071,
199
+ "eval_rougeL": 0.0303,
200
+ "eval_rougeLsum": 0.0304,
201
+ "eval_runtime": 9.5174,
202
+ "eval_samples_per_second": 11.558,
203
+ "eval_steps_per_second": 1.471,
204
+ "step": 412
205
+ },
206
+ {
207
+ "epoch": 16.0,
208
+ "eval_gen_len": 12.0545,
209
+ "eval_loss": 10.728998184204102,
210
+ "eval_rouge1": 0.0137,
211
+ "eval_rouge2": 0.002,
212
+ "eval_rougeL": 0.0114,
213
+ "eval_rougeLsum": 0.0112,
214
+ "eval_runtime": 9.5263,
215
+ "eval_samples_per_second": 11.547,
216
+ "eval_steps_per_second": 1.47,
217
+ "step": 440
218
+ },
219
+ {
220
+ "epoch": 16.98,
221
+ "eval_gen_len": 14.0818,
222
+ "eval_loss": 9.675826072692871,
223
+ "eval_rouge1": 0.0051,
224
+ "eval_rouge2": 0.0007,
225
+ "eval_rougeL": 0.005,
226
+ "eval_rougeLsum": 0.0049,
227
+ "eval_runtime": 9.4916,
228
+ "eval_samples_per_second": 11.589,
229
+ "eval_steps_per_second": 1.475,
230
+ "step": 467
231
+ },
232
+ {
233
+ "epoch": 18.0,
234
+ "eval_gen_len": 17.2545,
235
+ "eval_loss": 8.581353187561035,
236
+ "eval_rouge1": 0.0013,
237
+ "eval_rouge2": 0.0,
238
+ "eval_rougeL": 0.0013,
239
+ "eval_rougeLsum": 0.0013,
240
+ "eval_runtime": 9.4979,
241
+ "eval_samples_per_second": 11.581,
242
+ "eval_steps_per_second": 1.474,
243
+ "step": 495
244
+ },
245
+ {
246
+ "epoch": 18.18,
247
+ "grad_norm": 5.896080017089844,
248
+ "learning_rate": 1.8360493827160496e-05,
249
+ "loss": 16.7039,
250
+ "step": 500
251
+ },
252
+ {
253
+ "epoch": 18.98,
254
+ "eval_gen_len": 17.5455,
255
+ "eval_loss": 7.5404815673828125,
256
+ "eval_rouge1": 0.0015,
257
+ "eval_rouge2": 0.0,
258
+ "eval_rougeL": 0.0014,
259
+ "eval_rougeLsum": 0.0014,
260
+ "eval_runtime": 9.5287,
261
+ "eval_samples_per_second": 11.544,
262
+ "eval_steps_per_second": 1.469,
263
+ "step": 522
264
+ },
265
+ {
266
+ "epoch": 20.0,
267
+ "eval_gen_len": 18.5636,
268
+ "eval_loss": 6.503509044647217,
269
+ "eval_rouge1": 0.0006,
270
+ "eval_rouge2": 0.0,
271
+ "eval_rougeL": 0.0006,
272
+ "eval_rougeLsum": 0.0006,
273
+ "eval_runtime": 9.5264,
274
+ "eval_samples_per_second": 11.547,
275
+ "eval_steps_per_second": 1.47,
276
+ "step": 550
277
+ },
278
+ {
279
+ "epoch": 20.98,
280
+ "eval_gen_len": 19.0,
281
+ "eval_loss": 5.559322834014893,
282
+ "eval_rouge1": 0.0008,
283
+ "eval_rouge2": 0.0,
284
+ "eval_rougeL": 0.0007,
285
+ "eval_rougeLsum": 0.0007,
286
+ "eval_runtime": 9.5196,
287
+ "eval_samples_per_second": 11.555,
288
+ "eval_steps_per_second": 1.471,
289
+ "step": 577
290
+ },
291
+ {
292
+ "epoch": 22.0,
293
+ "eval_gen_len": 19.0,
294
+ "eval_loss": 4.715544700622559,
295
+ "eval_rouge1": 0.0002,
296
+ "eval_rouge2": 0.0,
297
+ "eval_rougeL": 0.0002,
298
+ "eval_rougeLsum": 0.0002,
299
+ "eval_runtime": 9.5225,
300
+ "eval_samples_per_second": 11.552,
301
+ "eval_steps_per_second": 1.47,
302
+ "step": 605
303
+ },
304
+ {
305
+ "epoch": 22.98,
306
+ "eval_gen_len": 19.0,
307
+ "eval_loss": 4.022518634796143,
308
+ "eval_rouge1": 0.0007,
309
+ "eval_rouge2": 0.0,
310
+ "eval_rougeL": 0.0007,
311
+ "eval_rougeLsum": 0.0007,
312
+ "eval_runtime": 9.5406,
313
+ "eval_samples_per_second": 11.53,
314
+ "eval_steps_per_second": 1.467,
315
+ "step": 632
316
+ },
317
+ {
318
+ "epoch": 24.0,
319
+ "eval_gen_len": 19.0,
320
+ "eval_loss": 3.401372194290161,
321
+ "eval_rouge1": 0.0006,
322
+ "eval_rouge2": 0.0,
323
+ "eval_rougeL": 0.0006,
324
+ "eval_rougeLsum": 0.0006,
325
+ "eval_runtime": 9.5371,
326
+ "eval_samples_per_second": 11.534,
327
+ "eval_steps_per_second": 1.468,
328
+ "step": 660
329
+ },
330
+ {
331
+ "epoch": 24.98,
332
+ "eval_gen_len": 19.0,
333
+ "eval_loss": 2.971496105194092,
334
+ "eval_rouge1": 0.0005,
335
+ "eval_rouge2": 0.0,
336
+ "eval_rougeL": 0.0005,
337
+ "eval_rougeLsum": 0.0005,
338
+ "eval_runtime": 9.5401,
339
+ "eval_samples_per_second": 11.53,
340
+ "eval_steps_per_second": 1.467,
341
+ "step": 687
342
+ },
343
+ {
344
+ "epoch": 26.0,
345
+ "eval_gen_len": 18.7273,
346
+ "eval_loss": 2.6538097858428955,
347
+ "eval_rouge1": 0.0008,
348
+ "eval_rouge2": 0.0,
349
+ "eval_rougeL": 0.0008,
350
+ "eval_rougeLsum": 0.0008,
351
+ "eval_runtime": 9.5288,
352
+ "eval_samples_per_second": 11.544,
353
+ "eval_steps_per_second": 1.469,
354
+ "step": 715
355
+ },
356
+ {
357
+ "epoch": 26.98,
358
+ "eval_gen_len": 14.5,
359
+ "eval_loss": 2.4105889797210693,
360
+ "eval_rouge1": 0.0003,
361
+ "eval_rouge2": 0.0,
362
+ "eval_rougeL": 0.0003,
363
+ "eval_rougeLsum": 0.0003,
364
+ "eval_runtime": 9.478,
365
+ "eval_samples_per_second": 11.606,
366
+ "eval_steps_per_second": 1.477,
367
+ "step": 742
368
+ },
369
+ {
370
+ "epoch": 28.0,
371
+ "eval_gen_len": 8.2636,
372
+ "eval_loss": 2.2297565937042236,
373
+ "eval_rouge1": 0.0,
374
+ "eval_rouge2": 0.0,
375
+ "eval_rougeL": 0.0,
376
+ "eval_rougeLsum": 0.0,
377
+ "eval_runtime": 9.4449,
378
+ "eval_samples_per_second": 11.646,
379
+ "eval_steps_per_second": 1.482,
380
+ "step": 770
381
+ },
382
+ {
383
+ "epoch": 28.98,
384
+ "eval_gen_len": 7.1455,
385
+ "eval_loss": 2.1042511463165283,
386
+ "eval_rouge1": 0.0,
387
+ "eval_rouge2": 0.0,
388
+ "eval_rougeL": 0.0,
389
+ "eval_rougeLsum": 0.0,
390
+ "eval_runtime": 9.4539,
391
+ "eval_samples_per_second": 11.635,
392
+ "eval_steps_per_second": 1.481,
393
+ "step": 797
394
+ },
395
+ {
396
+ "epoch": 30.0,
397
+ "eval_gen_len": 6.5545,
398
+ "eval_loss": 2.011505126953125,
399
+ "eval_rouge1": 0.0,
400
+ "eval_rouge2": 0.0,
401
+ "eval_rougeL": 0.0,
402
+ "eval_rougeLsum": 0.0,
403
+ "eval_runtime": 9.4645,
404
+ "eval_samples_per_second": 11.622,
405
+ "eval_steps_per_second": 1.479,
406
+ "step": 825
407
+ },
408
+ {
409
+ "epoch": 30.98,
410
+ "eval_gen_len": 6.0273,
411
+ "eval_loss": 1.927019715309143,
412
+ "eval_rouge1": 0.0,
413
+ "eval_rouge2": 0.0,
414
+ "eval_rougeL": 0.0,
415
+ "eval_rougeLsum": 0.0,
416
+ "eval_runtime": 9.5048,
417
+ "eval_samples_per_second": 11.573,
418
+ "eval_steps_per_second": 1.473,
419
+ "step": 852
420
+ },
421
+ {
422
+ "epoch": 32.0,
423
+ "eval_gen_len": 5.4091,
424
+ "eval_loss": 1.8653310537338257,
425
+ "eval_rouge1": 0.0,
426
+ "eval_rouge2": 0.0,
427
+ "eval_rougeL": 0.0,
428
+ "eval_rougeLsum": 0.0,
429
+ "eval_runtime": 9.49,
430
+ "eval_samples_per_second": 11.591,
431
+ "eval_steps_per_second": 1.475,
432
+ "step": 880
433
+ },
434
+ {
435
+ "epoch": 32.98,
436
+ "eval_gen_len": 5.1727,
437
+ "eval_loss": 1.8196451663970947,
438
+ "eval_rouge1": 0.0,
439
+ "eval_rouge2": 0.0,
440
+ "eval_rougeL": 0.0,
441
+ "eval_rougeLsum": 0.0,
442
+ "eval_runtime": 9.4647,
443
+ "eval_samples_per_second": 11.622,
444
+ "eval_steps_per_second": 1.479,
445
+ "step": 907
446
+ },
447
+ {
448
+ "epoch": 34.0,
449
+ "eval_gen_len": 5.1909,
450
+ "eval_loss": 1.7811836004257202,
451
+ "eval_rouge1": 0.0,
452
+ "eval_rouge2": 0.0,
453
+ "eval_rougeL": 0.0,
454
+ "eval_rougeLsum": 0.0,
455
+ "eval_runtime": 9.4798,
456
+ "eval_samples_per_second": 11.604,
457
+ "eval_steps_per_second": 1.477,
458
+ "step": 935
459
+ },
460
+ {
461
+ "epoch": 34.98,
462
+ "eval_gen_len": 5.3182,
463
+ "eval_loss": 1.7490955591201782,
464
+ "eval_rouge1": 0.0,
465
+ "eval_rouge2": 0.0,
466
+ "eval_rougeL": 0.0,
467
+ "eval_rougeLsum": 0.0,
468
+ "eval_runtime": 9.4662,
469
+ "eval_samples_per_second": 11.62,
470
+ "eval_steps_per_second": 1.479,
471
+ "step": 962
472
+ },
473
+ {
474
+ "epoch": 36.0,
475
+ "eval_gen_len": 5.3091,
476
+ "eval_loss": 1.721943974494934,
477
+ "eval_rouge1": 0.0003,
478
+ "eval_rouge2": 0.0,
479
+ "eval_rougeL": 0.0003,
480
+ "eval_rougeLsum": 0.0003,
481
+ "eval_runtime": 9.4582,
482
+ "eval_samples_per_second": 11.63,
483
+ "eval_steps_per_second": 1.48,
484
+ "step": 990
485
+ },
486
+ {
487
+ "epoch": 36.36,
488
+ "grad_norm": 3.1814663410186768,
489
+ "learning_rate": 1.6720987654320987e-05,
490
+ "loss": 3.9957,
491
+ "step": 1000
492
+ },
493
+ {
494
+ "epoch": 36.98,
495
+ "eval_gen_len": 5.3273,
496
+ "eval_loss": 1.69921875,
497
+ "eval_rouge1": 0.0,
498
+ "eval_rouge2": 0.0,
499
+ "eval_rougeL": 0.0,
500
+ "eval_rougeLsum": 0.0,
501
+ "eval_runtime": 9.5006,
502
+ "eval_samples_per_second": 11.578,
503
+ "eval_steps_per_second": 1.474,
504
+ "step": 1017
505
+ },
506
+ {
507
+ "epoch": 38.0,
508
+ "eval_gen_len": 5.2091,
509
+ "eval_loss": 1.678276777267456,
510
+ "eval_rouge1": 0.0003,
511
+ "eval_rouge2": 0.0,
512
+ "eval_rougeL": 0.0003,
513
+ "eval_rougeLsum": 0.0003,
514
+ "eval_runtime": 9.4859,
515
+ "eval_samples_per_second": 11.596,
516
+ "eval_steps_per_second": 1.476,
517
+ "step": 1045
518
+ },
519
+ {
520
+ "epoch": 38.98,
521
+ "eval_gen_len": 5.9273,
522
+ "eval_loss": 1.657922625541687,
523
+ "eval_rouge1": 0.0006,
524
+ "eval_rouge2": 0.0,
525
+ "eval_rougeL": 0.0006,
526
+ "eval_rougeLsum": 0.0006,
527
+ "eval_runtime": 9.4757,
528
+ "eval_samples_per_second": 11.609,
529
+ "eval_steps_per_second": 1.477,
530
+ "step": 1072
531
+ },
532
+ {
533
+ "epoch": 40.0,
534
+ "eval_gen_len": 6.1909,
535
+ "eval_loss": 1.6388959884643555,
536
+ "eval_rouge1": 0.0007,
537
+ "eval_rouge2": 0.0,
538
+ "eval_rougeL": 0.0007,
539
+ "eval_rougeLsum": 0.0007,
540
+ "eval_runtime": 9.4468,
541
+ "eval_samples_per_second": 11.644,
542
+ "eval_steps_per_second": 1.482,
543
+ "step": 1100
544
+ },
545
+ {
546
+ "epoch": 40.98,
547
+ "eval_gen_len": 5.6818,
548
+ "eval_loss": 1.622721552848816,
549
+ "eval_rouge1": 0.0,
550
+ "eval_rouge2": 0.0,
551
+ "eval_rougeL": 0.0,
552
+ "eval_rougeLsum": 0.0,
553
+ "eval_runtime": 9.4505,
554
+ "eval_samples_per_second": 11.64,
555
+ "eval_steps_per_second": 1.481,
556
+ "step": 1127
557
+ },
558
+ {
559
+ "epoch": 42.0,
560
+ "eval_gen_len": 5.0818,
561
+ "eval_loss": 1.6065285205841064,
562
+ "eval_rouge1": 0.0,
563
+ "eval_rouge2": 0.0,
564
+ "eval_rougeL": 0.0,
565
+ "eval_rougeLsum": 0.0,
566
+ "eval_runtime": 9.4897,
567
+ "eval_samples_per_second": 11.592,
568
+ "eval_steps_per_second": 1.475,
569
+ "step": 1155
570
+ },
571
+ {
572
+ "epoch": 42.98,
573
+ "eval_gen_len": 5.5091,
574
+ "eval_loss": 1.5892502069473267,
575
+ "eval_rouge1": 0.0002,
576
+ "eval_rouge2": 0.0,
577
+ "eval_rougeL": 0.0002,
578
+ "eval_rougeLsum": 0.0002,
579
+ "eval_runtime": 9.4662,
580
+ "eval_samples_per_second": 11.62,
581
+ "eval_steps_per_second": 1.479,
582
+ "step": 1182
583
+ },
584
+ {
585
+ "epoch": 44.0,
586
+ "eval_gen_len": 6.0636,
587
+ "eval_loss": 1.568816900253296,
588
+ "eval_rouge1": 0.0,
589
+ "eval_rouge2": 0.0,
590
+ "eval_rougeL": 0.0,
591
+ "eval_rougeLsum": 0.0,
592
+ "eval_runtime": 9.4716,
593
+ "eval_samples_per_second": 11.614,
594
+ "eval_steps_per_second": 1.478,
595
+ "step": 1210
596
+ },
597
+ {
598
+ "epoch": 44.98,
599
+ "eval_gen_len": 6.6273,
600
+ "eval_loss": 1.5522329807281494,
601
+ "eval_rouge1": 0.0,
602
+ "eval_rouge2": 0.0,
603
+ "eval_rougeL": 0.0,
604
+ "eval_rougeLsum": 0.0,
605
+ "eval_runtime": 9.4809,
606
+ "eval_samples_per_second": 11.602,
607
+ "eval_steps_per_second": 1.477,
608
+ "step": 1237
609
+ },
610
+ {
611
+ "epoch": 46.0,
612
+ "eval_gen_len": 6.5364,
613
+ "eval_loss": 1.539686918258667,
614
+ "eval_rouge1": 0.0,
615
+ "eval_rouge2": 0.0,
616
+ "eval_rougeL": 0.0,
617
+ "eval_rougeLsum": 0.0,
618
+ "eval_runtime": 9.4493,
619
+ "eval_samples_per_second": 11.641,
620
+ "eval_steps_per_second": 1.482,
621
+ "step": 1265
622
+ },
623
+ {
624
+ "epoch": 46.98,
625
+ "eval_gen_len": 6.3909,
626
+ "eval_loss": 1.527640700340271,
627
+ "eval_rouge1": 0.0,
628
+ "eval_rouge2": 0.0,
629
+ "eval_rougeL": 0.0,
630
+ "eval_rougeLsum": 0.0,
631
+ "eval_runtime": 9.441,
632
+ "eval_samples_per_second": 11.651,
633
+ "eval_steps_per_second": 1.483,
634
+ "step": 1292
635
+ },
636
+ {
637
+ "epoch": 48.0,
638
+ "eval_gen_len": 7.2818,
639
+ "eval_loss": 1.5062768459320068,
640
+ "eval_rouge1": 0.0,
641
+ "eval_rouge2": 0.0,
642
+ "eval_rougeL": 0.0,
643
+ "eval_rougeLsum": 0.0,
644
+ "eval_runtime": 9.4248,
645
+ "eval_samples_per_second": 11.671,
646
+ "eval_steps_per_second": 1.485,
647
+ "step": 1320
648
+ },
649
+ {
650
+ "epoch": 48.98,
651
+ "eval_gen_len": 7.9273,
652
+ "eval_loss": 1.4878034591674805,
653
+ "eval_rouge1": 0.0002,
654
+ "eval_rouge2": 0.0,
655
+ "eval_rougeL": 0.0002,
656
+ "eval_rougeLsum": 0.0002,
657
+ "eval_runtime": 9.5189,
658
+ "eval_samples_per_second": 11.556,
659
+ "eval_steps_per_second": 1.471,
660
+ "step": 1347
661
+ },
662
+ {
663
+ "epoch": 50.0,
664
+ "eval_gen_len": 7.5636,
665
+ "eval_loss": 1.4774630069732666,
666
+ "eval_rouge1": 0.0006,
667
+ "eval_rouge2": 0.0,
668
+ "eval_rougeL": 0.0006,
669
+ "eval_rougeLsum": 0.0006,
670
+ "eval_runtime": 9.4679,
671
+ "eval_samples_per_second": 11.618,
672
+ "eval_steps_per_second": 1.479,
673
+ "step": 1375
674
+ },
675
+ {
676
+ "epoch": 50.98,
677
+ "eval_gen_len": 8.2273,
678
+ "eval_loss": 1.4622873067855835,
679
+ "eval_rouge1": 0.0007,
680
+ "eval_rouge2": 0.0,
681
+ "eval_rougeL": 0.0007,
682
+ "eval_rougeLsum": 0.0007,
683
+ "eval_runtime": 9.4582,
684
+ "eval_samples_per_second": 11.63,
685
+ "eval_steps_per_second": 1.48,
686
+ "step": 1402
687
+ },
688
+ {
689
+ "epoch": 52.0,
690
+ "eval_gen_len": 8.5636,
691
+ "eval_loss": 1.4518091678619385,
692
+ "eval_rouge1": 0.0008,
693
+ "eval_rouge2": 0.0,
694
+ "eval_rougeL": 0.0008,
695
+ "eval_rougeLsum": 0.0008,
696
+ "eval_runtime": 9.4604,
697
+ "eval_samples_per_second": 11.627,
698
+ "eval_steps_per_second": 1.48,
699
+ "step": 1430
700
+ },
701
+ {
702
+ "epoch": 52.98,
703
+ "eval_gen_len": 8.2545,
704
+ "eval_loss": 1.4443649053573608,
705
+ "eval_rouge1": 0.0005,
706
+ "eval_rouge2": 0.0,
707
+ "eval_rougeL": 0.0005,
708
+ "eval_rougeLsum": 0.0005,
709
+ "eval_runtime": 9.4485,
710
+ "eval_samples_per_second": 11.642,
711
+ "eval_steps_per_second": 1.482,
712
+ "step": 1457
713
+ },
714
+ {
715
+ "epoch": 54.0,
716
+ "eval_gen_len": 7.9545,
717
+ "eval_loss": 1.4318201541900635,
718
+ "eval_rouge1": 0.0005,
719
+ "eval_rouge2": 0.0,
720
+ "eval_rougeL": 0.0003,
721
+ "eval_rougeLsum": 0.0003,
722
+ "eval_runtime": 9.5193,
723
+ "eval_samples_per_second": 11.555,
724
+ "eval_steps_per_second": 1.471,
725
+ "step": 1485
726
+ },
727
+ {
728
+ "epoch": 54.55,
729
+ "grad_norm": 4.463714599609375,
730
+ "learning_rate": 1.5078189300411523e-05,
731
+ "loss": 1.9182,
732
+ "step": 1500
733
+ },
734
+ {
735
+ "epoch": 54.98,
736
+ "eval_gen_len": 8.8273,
737
+ "eval_loss": 1.4121521711349487,
738
+ "eval_rouge1": 0.0009,
739
+ "eval_rouge2": 0.0,
740
+ "eval_rougeL": 0.0009,
741
+ "eval_rougeLsum": 0.0009,
742
+ "eval_runtime": 9.4998,
743
+ "eval_samples_per_second": 11.579,
744
+ "eval_steps_per_second": 1.474,
745
+ "step": 1512
746
+ },
747
+ {
748
+ "epoch": 56.0,
749
+ "eval_gen_len": 7.8455,
750
+ "eval_loss": 1.400160789489746,
751
+ "eval_rouge1": 0.0012,
752
+ "eval_rouge2": 0.0,
753
+ "eval_rougeL": 0.0013,
754
+ "eval_rougeLsum": 0.0012,
755
+ "eval_runtime": 9.4886,
756
+ "eval_samples_per_second": 11.593,
757
+ "eval_steps_per_second": 1.475,
758
+ "step": 1540
759
+ },
760
+ {
761
+ "epoch": 56.98,
762
+ "eval_gen_len": 7.7909,
763
+ "eval_loss": 1.3821996450424194,
764
+ "eval_rouge1": 0.0003,
765
+ "eval_rouge2": 0.0,
766
+ "eval_rougeL": 0.0003,
767
+ "eval_rougeLsum": 0.0003,
768
+ "eval_runtime": 9.4712,
769
+ "eval_samples_per_second": 11.614,
770
+ "eval_steps_per_second": 1.478,
771
+ "step": 1567
772
+ },
773
+ {
774
+ "epoch": 58.0,
775
+ "eval_gen_len": 8.5545,
776
+ "eval_loss": 1.370047688484192,
777
+ "eval_rouge1": 0.001,
778
+ "eval_rouge2": 0.0,
779
+ "eval_rougeL": 0.001,
780
+ "eval_rougeLsum": 0.001,
781
+ "eval_runtime": 9.4659,
782
+ "eval_samples_per_second": 11.621,
783
+ "eval_steps_per_second": 1.479,
784
+ "step": 1595
785
+ },
786
+ {
787
+ "epoch": 58.98,
788
+ "eval_gen_len": 8.4636,
789
+ "eval_loss": 1.3583662509918213,
790
+ "eval_rouge1": 0.0006,
791
+ "eval_rouge2": 0.0,
792
+ "eval_rougeL": 0.0006,
793
+ "eval_rougeLsum": 0.0006,
794
+ "eval_runtime": 9.472,
795
+ "eval_samples_per_second": 11.613,
796
+ "eval_steps_per_second": 1.478,
797
+ "step": 1622
798
+ },
799
+ {
800
+ "epoch": 60.0,
801
+ "eval_gen_len": 8.0909,
802
+ "eval_loss": 1.3374855518341064,
803
+ "eval_rouge1": 0.0002,
804
+ "eval_rouge2": 0.0,
805
+ "eval_rougeL": 0.0002,
806
+ "eval_rougeLsum": 0.0002,
807
+ "eval_runtime": 9.4676,
808
+ "eval_samples_per_second": 11.619,
809
+ "eval_steps_per_second": 1.479,
810
+ "step": 1650
811
+ },
812
+ {
813
+ "epoch": 60.98,
814
+ "eval_gen_len": 8.7,
815
+ "eval_loss": 1.3220137357711792,
816
+ "eval_rouge1": 0.0015,
817
+ "eval_rouge2": 0.0,
818
+ "eval_rougeL": 0.0016,
819
+ "eval_rougeLsum": 0.0016,
820
+ "eval_runtime": 9.504,
821
+ "eval_samples_per_second": 11.574,
822
+ "eval_steps_per_second": 1.473,
823
+ "step": 1677
824
+ },
825
+ {
826
+ "epoch": 62.0,
827
+ "eval_gen_len": 8.5818,
828
+ "eval_loss": 1.3142321109771729,
829
+ "eval_rouge1": 0.0002,
830
+ "eval_rouge2": 0.0,
831
+ "eval_rougeL": 0.0002,
832
+ "eval_rougeLsum": 0.0002,
833
+ "eval_runtime": 9.4996,
834
+ "eval_samples_per_second": 11.579,
835
+ "eval_steps_per_second": 1.474,
836
+ "step": 1705
837
+ },
838
+ {
839
+ "epoch": 62.98,
840
+ "eval_gen_len": 8.7727,
841
+ "eval_loss": 1.3096009492874146,
842
+ "eval_rouge1": 0.0016,
843
+ "eval_rouge2": 0.0,
844
+ "eval_rougeL": 0.0017,
845
+ "eval_rougeLsum": 0.0017,
846
+ "eval_runtime": 9.463,
847
+ "eval_samples_per_second": 11.624,
848
+ "eval_steps_per_second": 1.479,
849
+ "step": 1732
850
+ },
851
+ {
852
+ "epoch": 64.0,
853
+ "eval_gen_len": 9.0455,
854
+ "eval_loss": 1.2836058139801025,
855
+ "eval_rouge1": 0.0013,
856
+ "eval_rouge2": 0.0003,
857
+ "eval_rougeL": 0.0012,
858
+ "eval_rougeLsum": 0.0013,
859
+ "eval_runtime": 9.4725,
860
+ "eval_samples_per_second": 11.613,
861
+ "eval_steps_per_second": 1.478,
862
+ "step": 1760
863
+ },
864
+ {
865
+ "epoch": 64.98,
866
+ "eval_gen_len": 9.0,
867
+ "eval_loss": 1.2642889022827148,
868
+ "eval_rouge1": 0.002,
869
+ "eval_rouge2": 0.0003,
870
+ "eval_rougeL": 0.002,
871
+ "eval_rougeLsum": 0.002,
872
+ "eval_runtime": 9.6032,
873
+ "eval_samples_per_second": 11.454,
874
+ "eval_steps_per_second": 1.458,
875
+ "step": 1787
876
+ },
877
+ {
878
+ "epoch": 66.0,
879
+ "eval_gen_len": 8.8182,
880
+ "eval_loss": 1.2471901178359985,
881
+ "eval_rouge1": 0.0052,
882
+ "eval_rouge2": 0.001,
883
+ "eval_rougeL": 0.0052,
884
+ "eval_rougeLsum": 0.0052,
885
+ "eval_runtime": 9.4575,
886
+ "eval_samples_per_second": 11.631,
887
+ "eval_steps_per_second": 1.48,
888
+ "step": 1815
889
+ },
890
+ {
891
+ "epoch": 66.98,
892
+ "eval_gen_len": 9.6636,
893
+ "eval_loss": 1.226246953010559,
894
+ "eval_rouge1": 0.0086,
895
+ "eval_rouge2": 0.0013,
896
+ "eval_rougeL": 0.0088,
897
+ "eval_rougeLsum": 0.0088,
898
+ "eval_runtime": 9.4953,
899
+ "eval_samples_per_second": 11.585,
900
+ "eval_steps_per_second": 1.474,
901
+ "step": 1842
902
+ },
903
+ {
904
+ "epoch": 68.0,
905
+ "eval_gen_len": 9.5273,
906
+ "eval_loss": 1.2092421054840088,
907
+ "eval_rouge1": 0.0107,
908
+ "eval_rouge2": 0.0019,
909
+ "eval_rougeL": 0.009,
910
+ "eval_rougeLsum": 0.0088,
911
+ "eval_runtime": 9.5273,
912
+ "eval_samples_per_second": 11.546,
913
+ "eval_steps_per_second": 1.469,
914
+ "step": 1870
915
+ },
916
+ {
917
+ "epoch": 68.98,
918
+ "eval_gen_len": 9.8091,
919
+ "eval_loss": 1.1953155994415283,
920
+ "eval_rouge1": 0.0123,
921
+ "eval_rouge2": 0.0032,
922
+ "eval_rougeL": 0.012,
923
+ "eval_rougeLsum": 0.0118,
924
+ "eval_runtime": 9.4864,
925
+ "eval_samples_per_second": 11.596,
926
+ "eval_steps_per_second": 1.476,
927
+ "step": 1897
928
+ },
929
+ {
930
+ "epoch": 70.0,
931
+ "eval_gen_len": 9.3636,
932
+ "eval_loss": 1.1815446615219116,
933
+ "eval_rouge1": 0.0104,
934
+ "eval_rouge2": 0.0033,
935
+ "eval_rougeL": 0.0101,
936
+ "eval_rougeLsum": 0.0098,
937
+ "eval_runtime": 9.4807,
938
+ "eval_samples_per_second": 11.603,
939
+ "eval_steps_per_second": 1.477,
940
+ "step": 1925
941
+ },
942
+ {
943
+ "epoch": 70.98,
944
+ "eval_gen_len": 9.1091,
945
+ "eval_loss": 1.161879301071167,
946
+ "eval_rouge1": 0.0064,
947
+ "eval_rouge2": 0.0008,
948
+ "eval_rougeL": 0.0056,
949
+ "eval_rougeLsum": 0.0055,
950
+ "eval_runtime": 9.4717,
951
+ "eval_samples_per_second": 11.613,
952
+ "eval_steps_per_second": 1.478,
953
+ "step": 1952
954
+ },
955
+ {
956
+ "epoch": 72.0,
957
+ "eval_gen_len": 9.3273,
958
+ "eval_loss": 1.139123797416687,
959
+ "eval_rouge1": 0.0105,
960
+ "eval_rouge2": 0.002,
961
+ "eval_rougeL": 0.0099,
962
+ "eval_rougeLsum": 0.0098,
963
+ "eval_runtime": 9.4501,
964
+ "eval_samples_per_second": 11.64,
965
+ "eval_steps_per_second": 1.481,
966
+ "step": 1980
967
+ },
968
+ {
969
+ "epoch": 72.73,
970
+ "grad_norm": 3.556974172592163,
971
+ "learning_rate": 1.34320987654321e-05,
972
+ "loss": 1.6026,
973
+ "step": 2000
974
+ },
975
+ {
976
+ "epoch": 72.98,
977
+ "eval_gen_len": 9.7364,
978
+ "eval_loss": 1.1243102550506592,
979
+ "eval_rouge1": 0.0108,
980
+ "eval_rouge2": 0.0019,
981
+ "eval_rougeL": 0.0104,
982
+ "eval_rougeLsum": 0.0101,
983
+ "eval_runtime": 9.4726,
984
+ "eval_samples_per_second": 11.612,
985
+ "eval_steps_per_second": 1.478,
986
+ "step": 2007
987
+ },
988
+ {
989
+ "epoch": 74.0,
990
+ "eval_gen_len": 10.6182,
991
+ "eval_loss": 1.1061749458312988,
992
+ "eval_rouge1": 0.0204,
993
+ "eval_rouge2": 0.0031,
994
+ "eval_rougeL": 0.0185,
995
+ "eval_rougeLsum": 0.0183,
996
+ "eval_runtime": 9.4719,
997
+ "eval_samples_per_second": 11.613,
998
+ "eval_steps_per_second": 1.478,
999
+ "step": 2035
1000
+ },
1001
+ {
1002
+ "epoch": 74.98,
1003
+ "eval_gen_len": 10.1545,
1004
+ "eval_loss": 1.0930193662643433,
1005
+ "eval_rouge1": 0.0159,
1006
+ "eval_rouge2": 0.0017,
1007
+ "eval_rougeL": 0.0145,
1008
+ "eval_rougeLsum": 0.0143,
1009
+ "eval_runtime": 9.4961,
1010
+ "eval_samples_per_second": 11.584,
1011
+ "eval_steps_per_second": 1.474,
1012
+ "step": 2062
1013
+ },
1014
+ {
1015
+ "epoch": 76.0,
1016
+ "eval_gen_len": 10.7364,
1017
+ "eval_loss": 1.077184796333313,
1018
+ "eval_rouge1": 0.0151,
1019
+ "eval_rouge2": 0.0023,
1020
+ "eval_rougeL": 0.0133,
1021
+ "eval_rougeLsum": 0.0133,
1022
+ "eval_runtime": 9.5109,
1023
+ "eval_samples_per_second": 11.566,
1024
+ "eval_steps_per_second": 1.472,
1025
+ "step": 2090
1026
+ },
1027
+ {
1028
+ "epoch": 76.98,
1029
+ "eval_gen_len": 10.7,
1030
+ "eval_loss": 1.0578068494796753,
1031
+ "eval_rouge1": 0.0195,
1032
+ "eval_rouge2": 0.0044,
1033
+ "eval_rougeL": 0.0178,
1034
+ "eval_rougeLsum": 0.0178,
1035
+ "eval_runtime": 9.4786,
1036
+ "eval_samples_per_second": 11.605,
1037
+ "eval_steps_per_second": 1.477,
1038
+ "step": 2117
1039
+ },
1040
+ {
1041
+ "epoch": 78.0,
1042
+ "eval_gen_len": 11.0636,
1043
+ "eval_loss": 1.0393445491790771,
1044
+ "eval_rouge1": 0.0237,
1045
+ "eval_rouge2": 0.0048,
1046
+ "eval_rougeL": 0.0214,
1047
+ "eval_rougeLsum": 0.0211,
1048
+ "eval_runtime": 9.4415,
1049
+ "eval_samples_per_second": 11.651,
1050
+ "eval_steps_per_second": 1.483,
1051
+ "step": 2145
1052
+ },
1053
+ {
1054
+ "epoch": 78.98,
1055
+ "eval_gen_len": 10.1455,
1056
+ "eval_loss": 1.0262919664382935,
1057
+ "eval_rouge1": 0.0121,
1058
+ "eval_rouge2": 0.0014,
1059
+ "eval_rougeL": 0.0113,
1060
+ "eval_rougeLsum": 0.0112,
1061
+ "eval_runtime": 9.4285,
1062
+ "eval_samples_per_second": 11.667,
1063
+ "eval_steps_per_second": 1.485,
1064
+ "step": 2172
1065
+ },
1066
+ {
1067
+ "epoch": 80.0,
1068
+ "eval_gen_len": 11.3818,
1069
+ "eval_loss": 1.0064616203308105,
1070
+ "eval_rouge1": 0.0273,
1071
+ "eval_rouge2": 0.0048,
1072
+ "eval_rougeL": 0.0238,
1073
+ "eval_rougeLsum": 0.0235,
1074
+ "eval_runtime": 9.448,
1075
+ "eval_samples_per_second": 11.643,
1076
+ "eval_steps_per_second": 1.482,
1077
+ "step": 2200
1078
+ },
1079
+ {
1080
+ "epoch": 80.98,
1081
+ "eval_gen_len": 10.9545,
1082
+ "eval_loss": 0.98997563123703,
1083
+ "eval_rouge1": 0.0228,
1084
+ "eval_rouge2": 0.0042,
1085
+ "eval_rougeL": 0.0197,
1086
+ "eval_rougeLsum": 0.0196,
1087
+ "eval_runtime": 9.4798,
1088
+ "eval_samples_per_second": 11.604,
1089
+ "eval_steps_per_second": 1.477,
1090
+ "step": 2227
1091
+ },
1092
+ {
1093
+ "epoch": 82.0,
1094
+ "eval_gen_len": 10.9909,
1095
+ "eval_loss": 0.9675103425979614,
1096
+ "eval_rouge1": 0.024,
1097
+ "eval_rouge2": 0.0046,
1098
+ "eval_rougeL": 0.0204,
1099
+ "eval_rougeLsum": 0.0202,
1100
+ "eval_runtime": 9.4448,
1101
+ "eval_samples_per_second": 11.647,
1102
+ "eval_steps_per_second": 1.482,
1103
+ "step": 2255
1104
+ },
1105
+ {
1106
+ "epoch": 82.98,
1107
+ "eval_gen_len": 10.7182,
1108
+ "eval_loss": 0.9506540298461914,
1109
+ "eval_rouge1": 0.0244,
1110
+ "eval_rouge2": 0.0051,
1111
+ "eval_rougeL": 0.0211,
1112
+ "eval_rougeLsum": 0.0211,
1113
+ "eval_runtime": 9.5012,
1114
+ "eval_samples_per_second": 11.578,
1115
+ "eval_steps_per_second": 1.474,
1116
+ "step": 2282
1117
+ },
1118
+ {
1119
+ "epoch": 84.0,
1120
+ "eval_gen_len": 10.8636,
1121
+ "eval_loss": 0.9340613484382629,
1122
+ "eval_rouge1": 0.0249,
1123
+ "eval_rouge2": 0.0058,
1124
+ "eval_rougeL": 0.022,
1125
+ "eval_rougeLsum": 0.0223,
1126
+ "eval_runtime": 9.447,
1127
+ "eval_samples_per_second": 11.644,
1128
+ "eval_steps_per_second": 1.482,
1129
+ "step": 2310
1130
+ },
1131
+ {
1132
+ "epoch": 84.98,
1133
+ "eval_gen_len": 10.0909,
1134
+ "eval_loss": 0.9161014556884766,
1135
+ "eval_rouge1": 0.0243,
1136
+ "eval_rouge2": 0.0077,
1137
+ "eval_rougeL": 0.0224,
1138
+ "eval_rougeLsum": 0.0226,
1139
+ "eval_runtime": 9.4492,
1140
+ "eval_samples_per_second": 11.641,
1141
+ "eval_steps_per_second": 1.482,
1142
+ "step": 2337
1143
+ },
1144
+ {
1145
+ "epoch": 86.0,
1146
+ "eval_gen_len": 9.7182,
1147
+ "eval_loss": 0.8942736983299255,
1148
+ "eval_rouge1": 0.0176,
1149
+ "eval_rouge2": 0.0035,
1150
+ "eval_rougeL": 0.0152,
1151
+ "eval_rougeLsum": 0.0153,
1152
+ "eval_runtime": 9.4727,
1153
+ "eval_samples_per_second": 11.612,
1154
+ "eval_steps_per_second": 1.478,
1155
+ "step": 2365
1156
+ },
1157
+ {
1158
+ "epoch": 86.98,
1159
+ "eval_gen_len": 10.0,
1160
+ "eval_loss": 0.8758471608161926,
1161
+ "eval_rouge1": 0.0239,
1162
+ "eval_rouge2": 0.0093,
1163
+ "eval_rougeL": 0.0214,
1164
+ "eval_rougeLsum": 0.0215,
1165
+ "eval_runtime": 9.4864,
1166
+ "eval_samples_per_second": 11.596,
1167
+ "eval_steps_per_second": 1.476,
1168
+ "step": 2392
1169
+ },
1170
+ {
1171
+ "epoch": 88.0,
1172
+ "eval_gen_len": 10.2273,
1173
+ "eval_loss": 0.854742169380188,
1174
+ "eval_rouge1": 0.0254,
1175
+ "eval_rouge2": 0.0116,
1176
+ "eval_rougeL": 0.0237,
1177
+ "eval_rougeLsum": 0.0238,
1178
+ "eval_runtime": 9.5043,
1179
+ "eval_samples_per_second": 11.574,
1180
+ "eval_steps_per_second": 1.473,
1181
+ "step": 2420
1182
+ },
1183
+ {
1184
+ "epoch": 88.98,
1185
+ "eval_gen_len": 10.2545,
1186
+ "eval_loss": 0.8352662324905396,
1187
+ "eval_rouge1": 0.0196,
1188
+ "eval_rouge2": 0.007,
1189
+ "eval_rougeL": 0.0183,
1190
+ "eval_rougeLsum": 0.0182,
1191
+ "eval_runtime": 9.455,
1192
+ "eval_samples_per_second": 11.634,
1193
+ "eval_steps_per_second": 1.481,
1194
+ "step": 2447
1195
+ },
1196
+ {
1197
+ "epoch": 90.0,
1198
+ "eval_gen_len": 9.2364,
1199
+ "eval_loss": 0.8150736093521118,
1200
+ "eval_rouge1": 0.0104,
1201
+ "eval_rouge2": 0.0032,
1202
+ "eval_rougeL": 0.0095,
1203
+ "eval_rougeLsum": 0.0098,
1204
+ "eval_runtime": 9.4416,
1205
+ "eval_samples_per_second": 11.651,
1206
+ "eval_steps_per_second": 1.483,
1207
+ "step": 2475
1208
+ },
1209
+ {
1210
+ "epoch": 90.91,
1211
+ "grad_norm": 1.338108777999878,
1212
+ "learning_rate": 1.1786008230452676e-05,
1213
+ "loss": 1.2934,
1214
+ "step": 2500
1215
+ },
1216
+ {
1217
+ "epoch": 90.98,
1218
+ "eval_gen_len": 9.3455,
1219
+ "eval_loss": 0.7920636534690857,
1220
+ "eval_rouge1": 0.01,
1221
+ "eval_rouge2": 0.0036,
1222
+ "eval_rougeL": 0.0095,
1223
+ "eval_rougeLsum": 0.0096,
1224
+ "eval_runtime": 9.4783,
1225
+ "eval_samples_per_second": 11.605,
1226
+ "eval_steps_per_second": 1.477,
1227
+ "step": 2502
1228
+ },
1229
+ {
1230
+ "epoch": 92.0,
1231
+ "eval_gen_len": 9.2545,
1232
+ "eval_loss": 0.7697137594223022,
1233
+ "eval_rouge1": 0.012,
1234
+ "eval_rouge2": 0.0051,
1235
+ "eval_rougeL": 0.0111,
1236
+ "eval_rougeLsum": 0.011,
1237
+ "eval_runtime": 9.5826,
1238
+ "eval_samples_per_second": 11.479,
1239
+ "eval_steps_per_second": 1.461,
1240
+ "step": 2530
1241
+ },
1242
+ {
1243
+ "epoch": 92.98,
1244
+ "eval_gen_len": 8.8455,
1245
+ "eval_loss": 0.7492441534996033,
1246
+ "eval_rouge1": 0.0106,
1247
+ "eval_rouge2": 0.0048,
1248
+ "eval_rougeL": 0.0098,
1249
+ "eval_rougeLsum": 0.0098,
1250
+ "eval_runtime": 9.4826,
1251
+ "eval_samples_per_second": 11.6,
1252
+ "eval_steps_per_second": 1.476,
1253
+ "step": 2557
1254
+ },
1255
+ {
1256
+ "epoch": 94.0,
1257
+ "eval_gen_len": 9.4,
1258
+ "eval_loss": 0.7301111221313477,
1259
+ "eval_rouge1": 0.0112,
1260
+ "eval_rouge2": 0.0053,
1261
+ "eval_rougeL": 0.0095,
1262
+ "eval_rougeLsum": 0.0095,
1263
+ "eval_runtime": 9.4682,
1264
+ "eval_samples_per_second": 11.618,
1265
+ "eval_steps_per_second": 1.479,
1266
+ "step": 2585
1267
+ },
1268
+ {
1269
+ "epoch": 94.98,
1270
+ "eval_gen_len": 8.9,
1271
+ "eval_loss": 0.7125746607780457,
1272
+ "eval_rouge1": 0.0069,
1273
+ "eval_rouge2": 0.0026,
1274
+ "eval_rougeL": 0.0063,
1275
+ "eval_rougeLsum": 0.0064,
1276
+ "eval_runtime": 9.4699,
1277
+ "eval_samples_per_second": 11.616,
1278
+ "eval_steps_per_second": 1.478,
1279
+ "step": 2612
1280
+ },
1281
+ {
1282
+ "epoch": 96.0,
1283
+ "eval_gen_len": 9.2,
1284
+ "eval_loss": 0.6931119561195374,
1285
+ "eval_rouge1": 0.0107,
1286
+ "eval_rouge2": 0.0053,
1287
+ "eval_rougeL": 0.0088,
1288
+ "eval_rougeLsum": 0.0089,
1289
+ "eval_runtime": 9.4599,
1290
+ "eval_samples_per_second": 11.628,
1291
+ "eval_steps_per_second": 1.48,
1292
+ "step": 2640
1293
+ },
1294
+ {
1295
+ "epoch": 96.98,
1296
+ "eval_gen_len": 8.8909,
1297
+ "eval_loss": 0.6750566363334656,
1298
+ "eval_rouge1": 0.0063,
1299
+ "eval_rouge2": 0.0029,
1300
+ "eval_rougeL": 0.0049,
1301
+ "eval_rougeLsum": 0.0048,
1302
+ "eval_runtime": 9.4656,
1303
+ "eval_samples_per_second": 11.621,
1304
+ "eval_steps_per_second": 1.479,
1305
+ "step": 2667
1306
+ },
1307
+ {
1308
+ "epoch": 98.0,
1309
+ "eval_gen_len": 9.3182,
1310
+ "eval_loss": 0.6581041216850281,
1311
+ "eval_rouge1": 0.0091,
1312
+ "eval_rouge2": 0.005,
1313
+ "eval_rougeL": 0.0069,
1314
+ "eval_rougeLsum": 0.0071,
1315
+ "eval_runtime": 9.4802,
1316
+ "eval_samples_per_second": 11.603,
1317
+ "eval_steps_per_second": 1.477,
1318
+ "step": 2695
1319
+ },
1320
+ {
1321
+ "epoch": 98.98,
1322
+ "eval_gen_len": 9.1091,
1323
+ "eval_loss": 0.6413628458976746,
1324
+ "eval_rouge1": 0.0038,
1325
+ "eval_rouge2": 0.0027,
1326
+ "eval_rougeL": 0.0026,
1327
+ "eval_rougeLsum": 0.0034,
1328
+ "eval_runtime": 9.4864,
1329
+ "eval_samples_per_second": 11.595,
1330
+ "eval_steps_per_second": 1.476,
1331
+ "step": 2722
1332
+ },
1333
+ {
1334
+ "epoch": 100.0,
1335
+ "eval_gen_len": 9.5273,
1336
+ "eval_loss": 0.6237995028495789,
1337
+ "eval_rouge1": 0.0125,
1338
+ "eval_rouge2": 0.0088,
1339
+ "eval_rougeL": 0.0109,
1340
+ "eval_rougeLsum": 0.0109,
1341
+ "eval_runtime": 9.479,
1342
+ "eval_samples_per_second": 11.605,
1343
+ "eval_steps_per_second": 1.477,
1344
+ "step": 2750
1345
+ },
1346
+ {
1347
+ "epoch": 100.98,
1348
+ "eval_gen_len": 9.3727,
1349
+ "eval_loss": 0.6053850054740906,
1350
+ "eval_rouge1": 0.0053,
1351
+ "eval_rouge2": 0.0027,
1352
+ "eval_rougeL": 0.0042,
1353
+ "eval_rougeLsum": 0.0045,
1354
+ "eval_runtime": 9.4604,
1355
+ "eval_samples_per_second": 11.627,
1356
+ "eval_steps_per_second": 1.48,
1357
+ "step": 2777
1358
+ },
1359
+ {
1360
+ "epoch": 102.0,
1361
+ "eval_gen_len": 9.7091,
1362
+ "eval_loss": 0.5907317996025085,
1363
+ "eval_rouge1": 0.0115,
1364
+ "eval_rouge2": 0.0096,
1365
+ "eval_rougeL": 0.0108,
1366
+ "eval_rougeLsum": 0.0109,
1367
+ "eval_runtime": 9.4533,
1368
+ "eval_samples_per_second": 11.636,
1369
+ "eval_steps_per_second": 1.481,
1370
+ "step": 2805
1371
+ },
1372
+ {
1373
+ "epoch": 102.98,
1374
+ "eval_gen_len": 9.5364,
1375
+ "eval_loss": 0.5752558708190918,
1376
+ "eval_rouge1": 0.0055,
1377
+ "eval_rouge2": 0.0045,
1378
+ "eval_rougeL": 0.005,
1379
+ "eval_rougeLsum": 0.0053,
1380
+ "eval_runtime": 9.4454,
1381
+ "eval_samples_per_second": 11.646,
1382
+ "eval_steps_per_second": 1.482,
1383
+ "step": 2832
1384
+ },
1385
+ {
1386
+ "epoch": 104.0,
1387
+ "eval_gen_len": 9.7273,
1388
+ "eval_loss": 0.562364399433136,
1389
+ "eval_rouge1": 0.01,
1390
+ "eval_rouge2": 0.0057,
1391
+ "eval_rougeL": 0.0091,
1392
+ "eval_rougeLsum": 0.0094,
1393
+ "eval_runtime": 9.4478,
1394
+ "eval_samples_per_second": 11.643,
1395
+ "eval_steps_per_second": 1.482,
1396
+ "step": 2860
1397
+ },
1398
+ {
1399
+ "epoch": 104.98,
1400
+ "eval_gen_len": 9.6909,
1401
+ "eval_loss": 0.5496523380279541,
1402
+ "eval_rouge1": 0.0078,
1403
+ "eval_rouge2": 0.0038,
1404
+ "eval_rougeL": 0.0066,
1405
+ "eval_rougeLsum": 0.0069,
1406
+ "eval_runtime": 9.4963,
1407
+ "eval_samples_per_second": 11.583,
1408
+ "eval_steps_per_second": 1.474,
1409
+ "step": 2887
1410
+ },
1411
+ {
1412
+ "epoch": 106.0,
1413
+ "eval_gen_len": 9.6,
1414
+ "eval_loss": 0.5380507111549377,
1415
+ "eval_rouge1": 0.0077,
1416
+ "eval_rouge2": 0.0041,
1417
+ "eval_rougeL": 0.0068,
1418
+ "eval_rougeLsum": 0.0071,
1419
+ "eval_runtime": 9.4581,
1420
+ "eval_samples_per_second": 11.63,
1421
+ "eval_steps_per_second": 1.48,
1422
+ "step": 2915
1423
+ },
1424
+ {
1425
+ "epoch": 106.98,
1426
+ "eval_gen_len": 9.2909,
1427
+ "eval_loss": 0.5269507765769958,
1428
+ "eval_rouge1": 0.0109,
1429
+ "eval_rouge2": 0.0068,
1430
+ "eval_rougeL": 0.0101,
1431
+ "eval_rougeLsum": 0.0103,
1432
+ "eval_runtime": 9.4431,
1433
+ "eval_samples_per_second": 11.649,
1434
+ "eval_steps_per_second": 1.483,
1435
+ "step": 2942
1436
+ },
1437
+ {
1438
+ "epoch": 108.0,
1439
+ "eval_gen_len": 8.9636,
1440
+ "eval_loss": 0.5166643857955933,
1441
+ "eval_rouge1": 0.0095,
1442
+ "eval_rouge2": 0.004,
1443
+ "eval_rougeL": 0.008,
1444
+ "eval_rougeLsum": 0.0079,
1445
+ "eval_runtime": 9.4573,
1446
+ "eval_samples_per_second": 11.631,
1447
+ "eval_steps_per_second": 1.48,
1448
+ "step": 2970
1449
+ },
1450
+ {
1451
+ "epoch": 108.98,
1452
+ "eval_gen_len": 9.3818,
1453
+ "eval_loss": 0.5079358816146851,
1454
+ "eval_rouge1": 0.0078,
1455
+ "eval_rouge2": 0.0035,
1456
+ "eval_rougeL": 0.0055,
1457
+ "eval_rougeLsum": 0.0059,
1458
+ "eval_runtime": 9.4478,
1459
+ "eval_samples_per_second": 11.643,
1460
+ "eval_steps_per_second": 1.482,
1461
+ "step": 2997
1462
+ },
1463
+ {
1464
+ "epoch": 109.09,
1465
+ "grad_norm": 2.720813512802124,
1466
+ "learning_rate": 1.0139917695473251e-05,
1467
+ "loss": 0.9194,
1468
+ "step": 3000
1469
+ },
1470
+ {
1471
+ "epoch": 110.0,
1472
+ "eval_gen_len": 9.2909,
1473
+ "eval_loss": 0.5007773041725159,
1474
+ "eval_rouge1": 0.0044,
1475
+ "eval_rouge2": 0.0019,
1476
+ "eval_rougeL": 0.0037,
1477
+ "eval_rougeLsum": 0.0033,
1478
+ "eval_runtime": 9.4749,
1479
+ "eval_samples_per_second": 11.61,
1480
+ "eval_steps_per_second": 1.478,
1481
+ "step": 3025
1482
+ },
1483
+ {
1484
+ "epoch": 110.98,
1485
+ "eval_gen_len": 9.2909,
1486
+ "eval_loss": 0.49183493852615356,
1487
+ "eval_rouge1": 0.0108,
1488
+ "eval_rouge2": 0.0069,
1489
+ "eval_rougeL": 0.0087,
1490
+ "eval_rougeLsum": 0.0088,
1491
+ "eval_runtime": 9.5004,
1492
+ "eval_samples_per_second": 11.578,
1493
+ "eval_steps_per_second": 1.474,
1494
+ "step": 3052
1495
+ },
1496
+ {
1497
+ "epoch": 112.0,
1498
+ "eval_gen_len": 9.2455,
1499
+ "eval_loss": 0.48342087864875793,
1500
+ "eval_rouge1": 0.0112,
1501
+ "eval_rouge2": 0.008,
1502
+ "eval_rougeL": 0.0092,
1503
+ "eval_rougeLsum": 0.009,
1504
+ "eval_runtime": 9.5074,
1505
+ "eval_samples_per_second": 11.57,
1506
+ "eval_steps_per_second": 1.473,
1507
+ "step": 3080
1508
+ },
1509
+ {
1510
+ "epoch": 112.98,
1511
+ "eval_gen_len": 8.9364,
1512
+ "eval_loss": 0.47708794474601746,
1513
+ "eval_rouge1": 0.0057,
1514
+ "eval_rouge2": 0.0031,
1515
+ "eval_rougeL": 0.0042,
1516
+ "eval_rougeLsum": 0.004,
1517
+ "eval_runtime": 9.5044,
1518
+ "eval_samples_per_second": 11.574,
1519
+ "eval_steps_per_second": 1.473,
1520
+ "step": 3107
1521
+ },
1522
+ {
1523
+ "epoch": 114.0,
1524
+ "eval_gen_len": 9.1364,
1525
+ "eval_loss": 0.47102922201156616,
1526
+ "eval_rouge1": 0.0122,
1527
+ "eval_rouge2": 0.0065,
1528
+ "eval_rougeL": 0.0089,
1529
+ "eval_rougeLsum": 0.0087,
1530
+ "eval_runtime": 9.4603,
1531
+ "eval_samples_per_second": 11.628,
1532
+ "eval_steps_per_second": 1.48,
1533
+ "step": 3135
1534
+ },
1535
+ {
1536
+ "epoch": 114.98,
1537
+ "eval_gen_len": 9.2182,
1538
+ "eval_loss": 0.4654105007648468,
1539
+ "eval_rouge1": 0.0102,
1540
+ "eval_rouge2": 0.0063,
1541
+ "eval_rougeL": 0.0085,
1542
+ "eval_rougeLsum": 0.0083,
1543
+ "eval_runtime": 9.4614,
1544
+ "eval_samples_per_second": 11.626,
1545
+ "eval_steps_per_second": 1.48,
1546
+ "step": 3162
1547
+ },
1548
+ {
1549
+ "epoch": 116.0,
1550
+ "eval_gen_len": 8.9455,
1551
+ "eval_loss": 0.4600731432437897,
1552
+ "eval_rouge1": 0.0119,
1553
+ "eval_rouge2": 0.0092,
1554
+ "eval_rougeL": 0.0104,
1555
+ "eval_rougeLsum": 0.0102,
1556
+ "eval_runtime": 9.461,
1557
+ "eval_samples_per_second": 11.627,
1558
+ "eval_steps_per_second": 1.48,
1559
+ "step": 3190
1560
+ },
1561
+ {
1562
+ "epoch": 116.98,
1563
+ "eval_gen_len": 8.4182,
1564
+ "eval_loss": 0.4526377320289612,
1565
+ "eval_rouge1": 0.0107,
1566
+ "eval_rouge2": 0.0082,
1567
+ "eval_rougeL": 0.0099,
1568
+ "eval_rougeLsum": 0.0097,
1569
+ "eval_runtime": 9.4731,
1570
+ "eval_samples_per_second": 11.612,
1571
+ "eval_steps_per_second": 1.478,
1572
+ "step": 3217
1573
+ },
1574
+ {
1575
+ "epoch": 118.0,
1576
+ "eval_gen_len": 8.4273,
1577
+ "eval_loss": 0.4467811584472656,
1578
+ "eval_rouge1": 0.0052,
1579
+ "eval_rouge2": 0.0044,
1580
+ "eval_rougeL": 0.0045,
1581
+ "eval_rougeLsum": 0.0045,
1582
+ "eval_runtime": 9.4604,
1583
+ "eval_samples_per_second": 11.627,
1584
+ "eval_steps_per_second": 1.48,
1585
+ "step": 3245
1586
+ },
1587
+ {
1588
+ "epoch": 118.98,
1589
+ "eval_gen_len": 8.3,
1590
+ "eval_loss": 0.4426099956035614,
1591
+ "eval_rouge1": 0.0054,
1592
+ "eval_rouge2": 0.0041,
1593
+ "eval_rougeL": 0.005,
1594
+ "eval_rougeLsum": 0.0052,
1595
+ "eval_runtime": 9.4891,
1596
+ "eval_samples_per_second": 11.592,
1597
+ "eval_steps_per_second": 1.475,
1598
+ "step": 3272
1599
+ },
1600
+ {
1601
+ "epoch": 120.0,
1602
+ "eval_gen_len": 8.2727,
1603
+ "eval_loss": 0.43666210770606995,
1604
+ "eval_rouge1": 0.0107,
1605
+ "eval_rouge2": 0.0079,
1606
+ "eval_rougeL": 0.0102,
1607
+ "eval_rougeLsum": 0.0101,
1608
+ "eval_runtime": 9.5096,
1609
+ "eval_samples_per_second": 11.567,
1610
+ "eval_steps_per_second": 1.472,
1611
+ "step": 3300
1612
+ },
1613
+ {
1614
+ "epoch": 120.98,
1615
+ "eval_gen_len": 8.7182,
1616
+ "eval_loss": 0.4338292181491852,
1617
+ "eval_rouge1": 0.0142,
1618
+ "eval_rouge2": 0.0102,
1619
+ "eval_rougeL": 0.0134,
1620
+ "eval_rougeLsum": 0.0131,
1621
+ "eval_runtime": 9.4732,
1622
+ "eval_samples_per_second": 11.612,
1623
+ "eval_steps_per_second": 1.478,
1624
+ "step": 3327
1625
+ },
1626
+ {
1627
+ "epoch": 122.0,
1628
+ "eval_gen_len": 7.5727,
1629
+ "eval_loss": 0.4293038547039032,
1630
+ "eval_rouge1": 0.0045,
1631
+ "eval_rouge2": 0.0035,
1632
+ "eval_rougeL": 0.0039,
1633
+ "eval_rougeLsum": 0.0039,
1634
+ "eval_runtime": 9.458,
1635
+ "eval_samples_per_second": 11.63,
1636
+ "eval_steps_per_second": 1.48,
1637
+ "step": 3355
1638
+ },
1639
+ {
1640
+ "epoch": 122.98,
1641
+ "eval_gen_len": 7.8818,
1642
+ "eval_loss": 0.4247772991657257,
1643
+ "eval_rouge1": 0.0082,
1644
+ "eval_rouge2": 0.0056,
1645
+ "eval_rougeL": 0.0078,
1646
+ "eval_rougeLsum": 0.0076,
1647
+ "eval_runtime": 9.4976,
1648
+ "eval_samples_per_second": 11.582,
1649
+ "eval_steps_per_second": 1.474,
1650
+ "step": 3382
1651
+ },
1652
+ {
1653
+ "epoch": 124.0,
1654
+ "eval_gen_len": 7.4273,
1655
+ "eval_loss": 0.4226304888725281,
1656
+ "eval_rouge1": 0.0047,
1657
+ "eval_rouge2": 0.0039,
1658
+ "eval_rougeL": 0.0047,
1659
+ "eval_rougeLsum": 0.0047,
1660
+ "eval_runtime": 9.4775,
1661
+ "eval_samples_per_second": 11.606,
1662
+ "eval_steps_per_second": 1.477,
1663
+ "step": 3410
1664
+ },
1665
+ {
1666
+ "epoch": 124.98,
1667
+ "eval_gen_len": 7.7091,
1668
+ "eval_loss": 0.4187226891517639,
1669
+ "eval_rouge1": 0.0096,
1670
+ "eval_rouge2": 0.0065,
1671
+ "eval_rougeL": 0.0097,
1672
+ "eval_rougeLsum": 0.0095,
1673
+ "eval_runtime": 9.4596,
1674
+ "eval_samples_per_second": 11.628,
1675
+ "eval_steps_per_second": 1.48,
1676
+ "step": 3437
1677
+ },
1678
+ {
1679
+ "epoch": 126.0,
1680
+ "eval_gen_len": 7.1364,
1681
+ "eval_loss": 0.4152156412601471,
1682
+ "eval_rouge1": 0.0026,
1683
+ "eval_rouge2": 0.0024,
1684
+ "eval_rougeL": 0.0026,
1685
+ "eval_rougeLsum": 0.0026,
1686
+ "eval_runtime": 9.4787,
1687
+ "eval_samples_per_second": 11.605,
1688
+ "eval_steps_per_second": 1.477,
1689
+ "step": 3465
1690
+ },
1691
+ {
1692
+ "epoch": 126.98,
1693
+ "eval_gen_len": 6.8909,
1694
+ "eval_loss": 0.4114760458469391,
1695
+ "eval_rouge1": 0.0026,
1696
+ "eval_rouge2": 0.0024,
1697
+ "eval_rougeL": 0.0026,
1698
+ "eval_rougeLsum": 0.0026,
1699
+ "eval_runtime": 9.4927,
1700
+ "eval_samples_per_second": 11.588,
1701
+ "eval_steps_per_second": 1.475,
1702
+ "step": 3492
1703
+ },
1704
+ {
1705
+ "epoch": 127.27,
1706
+ "grad_norm": 5.880601406097412,
1707
+ "learning_rate": 8.493827160493828e-06,
1708
+ "loss": 0.6369,
1709
+ "step": 3500
1710
+ },
1711
+ {
1712
+ "epoch": 128.0,
1713
+ "eval_gen_len": 6.7182,
1714
+ "eval_loss": 0.4087616503238678,
1715
+ "eval_rouge1": 0.0051,
1716
+ "eval_rouge2": 0.0048,
1717
+ "eval_rougeL": 0.0051,
1718
+ "eval_rougeLsum": 0.0051,
1719
+ "eval_runtime": 9.5116,
1720
+ "eval_samples_per_second": 11.565,
1721
+ "eval_steps_per_second": 1.472,
1722
+ "step": 3520
1723
+ },
1724
+ {
1725
+ "epoch": 128.98,
1726
+ "eval_gen_len": 7.3091,
1727
+ "eval_loss": 0.4050390422344208,
1728
+ "eval_rouge1": 0.0113,
1729
+ "eval_rouge2": 0.0097,
1730
+ "eval_rougeL": 0.0115,
1731
+ "eval_rougeLsum": 0.0115,
1732
+ "eval_runtime": 9.5251,
1733
+ "eval_samples_per_second": 11.548,
1734
+ "eval_steps_per_second": 1.47,
1735
+ "step": 3547
1736
+ },
1737
+ {
1738
+ "epoch": 130.0,
1739
+ "eval_gen_len": 7.2727,
1740
+ "eval_loss": 0.40342459082603455,
1741
+ "eval_rouge1": 0.0097,
1742
+ "eval_rouge2": 0.0086,
1743
+ "eval_rougeL": 0.0098,
1744
+ "eval_rougeLsum": 0.0099,
1745
+ "eval_runtime": 9.5061,
1746
+ "eval_samples_per_second": 11.572,
1747
+ "eval_steps_per_second": 1.473,
1748
+ "step": 3575
1749
+ },
1750
+ {
1751
+ "epoch": 130.98,
1752
+ "eval_gen_len": 6.9455,
1753
+ "eval_loss": 0.39917439222335815,
1754
+ "eval_rouge1": 0.0096,
1755
+ "eval_rouge2": 0.0081,
1756
+ "eval_rougeL": 0.0097,
1757
+ "eval_rougeLsum": 0.0097,
1758
+ "eval_runtime": 9.5045,
1759
+ "eval_samples_per_second": 11.573,
1760
+ "eval_steps_per_second": 1.473,
1761
+ "step": 3602
1762
+ },
1763
+ {
1764
+ "epoch": 132.0,
1765
+ "eval_gen_len": 6.6,
1766
+ "eval_loss": 0.3954027593135834,
1767
+ "eval_rouge1": 0.0053,
1768
+ "eval_rouge2": 0.0056,
1769
+ "eval_rougeL": 0.0053,
1770
+ "eval_rougeLsum": 0.0061,
1771
+ "eval_runtime": 9.4882,
1772
+ "eval_samples_per_second": 11.593,
1773
+ "eval_steps_per_second": 1.476,
1774
+ "step": 3630
1775
+ },
1776
+ {
1777
+ "epoch": 132.98,
1778
+ "eval_gen_len": 6.6727,
1779
+ "eval_loss": 0.3916667103767395,
1780
+ "eval_rouge1": 0.0061,
1781
+ "eval_rouge2": 0.0056,
1782
+ "eval_rougeL": 0.006,
1783
+ "eval_rougeLsum": 0.0061,
1784
+ "eval_runtime": 9.4921,
1785
+ "eval_samples_per_second": 11.589,
1786
+ "eval_steps_per_second": 1.475,
1787
+ "step": 3657
1788
+ },
1789
+ {
1790
+ "epoch": 134.0,
1791
+ "eval_gen_len": 6.9455,
1792
+ "eval_loss": 0.3922131359577179,
1793
+ "eval_rouge1": 0.0084,
1794
+ "eval_rouge2": 0.0072,
1795
+ "eval_rougeL": 0.0082,
1796
+ "eval_rougeLsum": 0.0086,
1797
+ "eval_runtime": 9.4995,
1798
+ "eval_samples_per_second": 11.58,
1799
+ "eval_steps_per_second": 1.474,
1800
+ "step": 3685
1801
+ },
1802
+ {
1803
+ "epoch": 134.98,
1804
+ "eval_gen_len": 6.3727,
1805
+ "eval_loss": 0.38674217462539673,
1806
+ "eval_rouge1": 0.0052,
1807
+ "eval_rouge2": 0.003,
1808
+ "eval_rougeL": 0.0046,
1809
+ "eval_rougeLsum": 0.0046,
1810
+ "eval_runtime": 9.4561,
1811
+ "eval_samples_per_second": 11.633,
1812
+ "eval_steps_per_second": 1.481,
1813
+ "step": 3712
1814
+ },
1815
+ {
1816
+ "epoch": 136.0,
1817
+ "eval_gen_len": 6.6273,
1818
+ "eval_loss": 0.38475027680397034,
1819
+ "eval_rouge1": 0.009,
1820
+ "eval_rouge2": 0.0061,
1821
+ "eval_rougeL": 0.0083,
1822
+ "eval_rougeLsum": 0.0086,
1823
+ "eval_runtime": 9.4487,
1824
+ "eval_samples_per_second": 11.642,
1825
+ "eval_steps_per_second": 1.482,
1826
+ "step": 3740
1827
+ },
1828
+ {
1829
+ "epoch": 136.98,
1830
+ "eval_gen_len": 6.4455,
1831
+ "eval_loss": 0.37981557846069336,
1832
+ "eval_rouge1": 0.0123,
1833
+ "eval_rouge2": 0.0095,
1834
+ "eval_rougeL": 0.0119,
1835
+ "eval_rougeLsum": 0.0121,
1836
+ "eval_runtime": 9.4838,
1837
+ "eval_samples_per_second": 11.599,
1838
+ "eval_steps_per_second": 1.476,
1839
+ "step": 3767
1840
+ },
1841
+ {
1842
+ "epoch": 138.0,
1843
+ "eval_gen_len": 6.6909,
1844
+ "eval_loss": 0.3788022994995117,
1845
+ "eval_rouge1": 0.0138,
1846
+ "eval_rouge2": 0.01,
1847
+ "eval_rougeL": 0.0132,
1848
+ "eval_rougeLsum": 0.0133,
1849
+ "eval_runtime": 9.4736,
1850
+ "eval_samples_per_second": 11.611,
1851
+ "eval_steps_per_second": 1.478,
1852
+ "step": 3795
1853
+ },
1854
+ {
1855
+ "epoch": 138.98,
1856
+ "eval_gen_len": 6.3909,
1857
+ "eval_loss": 0.3755718171596527,
1858
+ "eval_rouge1": 0.0119,
1859
+ "eval_rouge2": 0.0085,
1860
+ "eval_rougeL": 0.0116,
1861
+ "eval_rougeLsum": 0.0116,
1862
+ "eval_runtime": 9.4922,
1863
+ "eval_samples_per_second": 11.588,
1864
+ "eval_steps_per_second": 1.475,
1865
+ "step": 3822
1866
+ },
1867
+ {
1868
+ "epoch": 140.0,
1869
+ "eval_gen_len": 6.6545,
1870
+ "eval_loss": 0.3745150864124298,
1871
+ "eval_rouge1": 0.0135,
1872
+ "eval_rouge2": 0.0095,
1873
+ "eval_rougeL": 0.013,
1874
+ "eval_rougeLsum": 0.013,
1875
+ "eval_runtime": 9.5082,
1876
+ "eval_samples_per_second": 11.569,
1877
+ "eval_steps_per_second": 1.472,
1878
+ "step": 3850
1879
+ },
1880
+ {
1881
+ "epoch": 140.98,
1882
+ "eval_gen_len": 6.9636,
1883
+ "eval_loss": 0.37223342061042786,
1884
+ "eval_rouge1": 0.0175,
1885
+ "eval_rouge2": 0.0123,
1886
+ "eval_rougeL": 0.0171,
1887
+ "eval_rougeLsum": 0.0168,
1888
+ "eval_runtime": 9.506,
1889
+ "eval_samples_per_second": 11.572,
1890
+ "eval_steps_per_second": 1.473,
1891
+ "step": 3877
1892
+ },
1893
+ {
1894
+ "epoch": 142.0,
1895
+ "eval_gen_len": 7.0727,
1896
+ "eval_loss": 0.36922305822372437,
1897
+ "eval_rouge1": 0.0188,
1898
+ "eval_rouge2": 0.0127,
1899
+ "eval_rougeL": 0.0183,
1900
+ "eval_rougeLsum": 0.018,
1901
+ "eval_runtime": 9.4985,
1902
+ "eval_samples_per_second": 11.581,
1903
+ "eval_steps_per_second": 1.474,
1904
+ "step": 3905
1905
+ },
1906
+ {
1907
+ "epoch": 142.98,
1908
+ "eval_gen_len": 7.2727,
1909
+ "eval_loss": 0.3674834668636322,
1910
+ "eval_rouge1": 0.0201,
1911
+ "eval_rouge2": 0.0136,
1912
+ "eval_rougeL": 0.0197,
1913
+ "eval_rougeLsum": 0.0194,
1914
+ "eval_runtime": 9.4928,
1915
+ "eval_samples_per_second": 11.588,
1916
+ "eval_steps_per_second": 1.475,
1917
+ "step": 3932
1918
+ },
1919
+ {
1920
+ "epoch": 144.0,
1921
+ "eval_gen_len": 7.1818,
1922
+ "eval_loss": 0.36526089906692505,
1923
+ "eval_rouge1": 0.0215,
1924
+ "eval_rouge2": 0.0139,
1925
+ "eval_rougeL": 0.0211,
1926
+ "eval_rougeLsum": 0.0208,
1927
+ "eval_runtime": 9.4987,
1928
+ "eval_samples_per_second": 11.581,
1929
+ "eval_steps_per_second": 1.474,
1930
+ "step": 3960
1931
+ },
1932
+ {
1933
+ "epoch": 144.98,
1934
+ "eval_gen_len": 7.2,
1935
+ "eval_loss": 0.36316850781440735,
1936
+ "eval_rouge1": 0.0209,
1937
+ "eval_rouge2": 0.0128,
1938
+ "eval_rougeL": 0.0201,
1939
+ "eval_rougeLsum": 0.0202,
1940
+ "eval_runtime": 9.4792,
1941
+ "eval_samples_per_second": 11.604,
1942
+ "eval_steps_per_second": 1.477,
1943
+ "step": 3987
1944
+ },
1945
+ {
1946
+ "epoch": 145.45,
1947
+ "grad_norm": 0.6945245265960693,
1948
+ "learning_rate": 6.847736625514404e-06,
1949
+ "loss": 0.5099,
1950
+ "step": 4000
1951
+ },
1952
+ {
1953
+ "epoch": 146.0,
1954
+ "eval_gen_len": 7.1364,
1955
+ "eval_loss": 0.3603822886943817,
1956
+ "eval_rouge1": 0.022,
1957
+ "eval_rouge2": 0.0145,
1958
+ "eval_rougeL": 0.0213,
1959
+ "eval_rougeLsum": 0.0212,
1960
+ "eval_runtime": 9.5107,
1961
+ "eval_samples_per_second": 11.566,
1962
+ "eval_steps_per_second": 1.472,
1963
+ "step": 4015
1964
+ },
1965
+ {
1966
+ "epoch": 146.98,
1967
+ "eval_gen_len": 7.1182,
1968
+ "eval_loss": 0.35853010416030884,
1969
+ "eval_rouge1": 0.022,
1970
+ "eval_rouge2": 0.0145,
1971
+ "eval_rougeL": 0.0213,
1972
+ "eval_rougeLsum": 0.0212,
1973
+ "eval_runtime": 9.5074,
1974
+ "eval_samples_per_second": 11.57,
1975
+ "eval_steps_per_second": 1.473,
1976
+ "step": 4042
1977
+ },
1978
+ {
1979
+ "epoch": 148.0,
1980
+ "eval_gen_len": 7.7364,
1981
+ "eval_loss": 0.35745835304260254,
1982
+ "eval_rouge1": 0.0283,
1983
+ "eval_rouge2": 0.018,
1984
+ "eval_rougeL": 0.0269,
1985
+ "eval_rougeLsum": 0.0269,
1986
+ "eval_runtime": 9.482,
1987
+ "eval_samples_per_second": 11.601,
1988
+ "eval_steps_per_second": 1.476,
1989
+ "step": 4070
1990
+ },
1991
+ {
1992
+ "epoch": 148.98,
1993
+ "eval_gen_len": 8.3364,
1994
+ "eval_loss": 0.3559305965900421,
1995
+ "eval_rouge1": 0.0374,
1996
+ "eval_rouge2": 0.0236,
1997
+ "eval_rougeL": 0.0357,
1998
+ "eval_rougeLsum": 0.0359,
1999
+ "eval_runtime": 9.4877,
2000
+ "eval_samples_per_second": 11.594,
2001
+ "eval_steps_per_second": 1.476,
2002
+ "step": 4097
2003
+ },
2004
+ {
2005
+ "epoch": 150.0,
2006
+ "eval_gen_len": 8.9091,
2007
+ "eval_loss": 0.3528214395046234,
2008
+ "eval_rouge1": 0.0384,
2009
+ "eval_rouge2": 0.0236,
2010
+ "eval_rougeL": 0.0366,
2011
+ "eval_rougeLsum": 0.0369,
2012
+ "eval_runtime": 9.4747,
2013
+ "eval_samples_per_second": 11.61,
2014
+ "eval_steps_per_second": 1.478,
2015
+ "step": 4125
2016
+ },
2017
+ {
2018
+ "epoch": 150.98,
2019
+ "eval_gen_len": 9.4455,
2020
+ "eval_loss": 0.35084155201911926,
2021
+ "eval_rouge1": 0.0416,
2022
+ "eval_rouge2": 0.0254,
2023
+ "eval_rougeL": 0.0399,
2024
+ "eval_rougeLsum": 0.0399,
2025
+ "eval_runtime": 9.4685,
2026
+ "eval_samples_per_second": 11.618,
2027
+ "eval_steps_per_second": 1.479,
2028
+ "step": 4152
2029
+ },
2030
+ {
2031
+ "epoch": 152.0,
2032
+ "eval_gen_len": 9.6091,
2033
+ "eval_loss": 0.34896430373191833,
2034
+ "eval_rouge1": 0.0439,
2035
+ "eval_rouge2": 0.0257,
2036
+ "eval_rougeL": 0.0413,
2037
+ "eval_rougeLsum": 0.0415,
2038
+ "eval_runtime": 9.5218,
2039
+ "eval_samples_per_second": 11.552,
2040
+ "eval_steps_per_second": 1.47,
2041
+ "step": 4180
2042
+ },
2043
+ {
2044
+ "epoch": 152.98,
2045
+ "eval_gen_len": 9.7636,
2046
+ "eval_loss": 0.34783267974853516,
2047
+ "eval_rouge1": 0.0479,
2048
+ "eval_rouge2": 0.0297,
2049
+ "eval_rougeL": 0.045,
2050
+ "eval_rougeLsum": 0.0454,
2051
+ "eval_runtime": 9.5136,
2052
+ "eval_samples_per_second": 11.562,
2053
+ "eval_steps_per_second": 1.472,
2054
+ "step": 4207
2055
+ },
2056
+ {
2057
+ "epoch": 154.0,
2058
+ "eval_gen_len": 10.1909,
2059
+ "eval_loss": 0.3452661335468292,
2060
+ "eval_rouge1": 0.0495,
2061
+ "eval_rouge2": 0.0291,
2062
+ "eval_rougeL": 0.0464,
2063
+ "eval_rougeLsum": 0.0464,
2064
+ "eval_runtime": 9.5097,
2065
+ "eval_samples_per_second": 11.567,
2066
+ "eval_steps_per_second": 1.472,
2067
+ "step": 4235
2068
+ },
2069
+ {
2070
+ "epoch": 154.98,
2071
+ "eval_gen_len": 10.6,
2072
+ "eval_loss": 0.34583330154418945,
2073
+ "eval_rouge1": 0.0576,
2074
+ "eval_rouge2": 0.035,
2075
+ "eval_rougeL": 0.055,
2076
+ "eval_rougeLsum": 0.0551,
2077
+ "eval_runtime": 9.4851,
2078
+ "eval_samples_per_second": 11.597,
2079
+ "eval_steps_per_second": 1.476,
2080
+ "step": 4262
2081
+ },
2082
+ {
2083
+ "epoch": 156.0,
2084
+ "eval_gen_len": 10.2909,
2085
+ "eval_loss": 0.3417557179927826,
2086
+ "eval_rouge1": 0.0533,
2087
+ "eval_rouge2": 0.0314,
2088
+ "eval_rougeL": 0.0506,
2089
+ "eval_rougeLsum": 0.0507,
2090
+ "eval_runtime": 9.474,
2091
+ "eval_samples_per_second": 11.611,
2092
+ "eval_steps_per_second": 1.478,
2093
+ "step": 4290
2094
+ },
2095
+ {
2096
+ "epoch": 156.98,
2097
+ "eval_gen_len": 10.9364,
2098
+ "eval_loss": 0.3396497666835785,
2099
+ "eval_rouge1": 0.0591,
2100
+ "eval_rouge2": 0.0351,
2101
+ "eval_rougeL": 0.0561,
2102
+ "eval_rougeLsum": 0.0561,
2103
+ "eval_runtime": 9.496,
2104
+ "eval_samples_per_second": 11.584,
2105
+ "eval_steps_per_second": 1.474,
2106
+ "step": 4317
2107
+ },
2108
+ {
2109
+ "epoch": 158.0,
2110
+ "eval_gen_len": 11.0364,
2111
+ "eval_loss": 0.3386593759059906,
2112
+ "eval_rouge1": 0.0633,
2113
+ "eval_rouge2": 0.0387,
2114
+ "eval_rougeL": 0.0605,
2115
+ "eval_rougeLsum": 0.0601,
2116
+ "eval_runtime": 9.4912,
2117
+ "eval_samples_per_second": 11.59,
2118
+ "eval_steps_per_second": 1.475,
2119
+ "step": 4345
2120
+ },
2121
+ {
2122
+ "epoch": 158.98,
2123
+ "eval_gen_len": 11.4455,
2124
+ "eval_loss": 0.3368191123008728,
2125
+ "eval_rouge1": 0.0614,
2126
+ "eval_rouge2": 0.0371,
2127
+ "eval_rougeL": 0.0593,
2128
+ "eval_rougeLsum": 0.0583,
2129
+ "eval_runtime": 9.5012,
2130
+ "eval_samples_per_second": 11.578,
2131
+ "eval_steps_per_second": 1.474,
2132
+ "step": 4372
2133
+ },
2134
+ {
2135
+ "epoch": 160.0,
2136
+ "eval_gen_len": 11.4545,
2137
+ "eval_loss": 0.33700016140937805,
2138
+ "eval_rouge1": 0.0702,
2139
+ "eval_rouge2": 0.0444,
2140
+ "eval_rougeL": 0.0672,
2141
+ "eval_rougeLsum": 0.0671,
2142
+ "eval_runtime": 9.4911,
2143
+ "eval_samples_per_second": 11.59,
2144
+ "eval_steps_per_second": 1.475,
2145
+ "step": 4400
2146
+ },
2147
+ {
2148
+ "epoch": 160.98,
2149
+ "eval_gen_len": 11.4182,
2150
+ "eval_loss": 0.3347805440425873,
2151
+ "eval_rouge1": 0.0702,
2152
+ "eval_rouge2": 0.0444,
2153
+ "eval_rougeL": 0.0672,
2154
+ "eval_rougeLsum": 0.0671,
2155
+ "eval_runtime": 9.4829,
2156
+ "eval_samples_per_second": 11.6,
2157
+ "eval_steps_per_second": 1.476,
2158
+ "step": 4427
2159
+ },
2160
+ {
2161
+ "epoch": 162.0,
2162
+ "eval_gen_len": 11.2818,
2163
+ "eval_loss": 0.332653284072876,
2164
+ "eval_rouge1": 0.0691,
2165
+ "eval_rouge2": 0.0438,
2166
+ "eval_rougeL": 0.0661,
2167
+ "eval_rougeLsum": 0.0656,
2168
+ "eval_runtime": 9.5355,
2169
+ "eval_samples_per_second": 11.536,
2170
+ "eval_steps_per_second": 1.468,
2171
+ "step": 4455
2172
+ },
2173
+ {
2174
+ "epoch": 162.98,
2175
+ "eval_gen_len": 11.3545,
2176
+ "eval_loss": 0.3307643234729767,
2177
+ "eval_rouge1": 0.0648,
2178
+ "eval_rouge2": 0.0405,
2179
+ "eval_rougeL": 0.0614,
2180
+ "eval_rougeLsum": 0.0609,
2181
+ "eval_runtime": 9.4851,
2182
+ "eval_samples_per_second": 11.597,
2183
+ "eval_steps_per_second": 1.476,
2184
+ "step": 4482
2185
+ },
2186
+ {
2187
+ "epoch": 163.64,
2188
+ "grad_norm": 0.6090702414512634,
2189
+ "learning_rate": 5.20164609053498e-06,
2190
+ "loss": 0.4471,
2191
+ "step": 4500
2192
+ },
2193
+ {
2194
+ "epoch": 164.0,
2195
+ "eval_gen_len": 11.7909,
2196
+ "eval_loss": 0.32992881536483765,
2197
+ "eval_rouge1": 0.0711,
2198
+ "eval_rouge2": 0.0441,
2199
+ "eval_rougeL": 0.0677,
2200
+ "eval_rougeLsum": 0.0667,
2201
+ "eval_runtime": 9.5142,
2202
+ "eval_samples_per_second": 11.562,
2203
+ "eval_steps_per_second": 1.471,
2204
+ "step": 4510
2205
+ },
2206
+ {
2207
+ "epoch": 164.98,
2208
+ "eval_gen_len": 12.0273,
2209
+ "eval_loss": 0.3291892111301422,
2210
+ "eval_rouge1": 0.0749,
2211
+ "eval_rouge2": 0.0476,
2212
+ "eval_rougeL": 0.0709,
2213
+ "eval_rougeLsum": 0.0701,
2214
+ "eval_runtime": 9.5232,
2215
+ "eval_samples_per_second": 11.551,
2216
+ "eval_steps_per_second": 1.47,
2217
+ "step": 4537
2218
+ },
2219
+ {
2220
+ "epoch": 166.0,
2221
+ "eval_gen_len": 12.4364,
2222
+ "eval_loss": 0.3260752558708191,
2223
+ "eval_rouge1": 0.078,
2224
+ "eval_rouge2": 0.0484,
2225
+ "eval_rougeL": 0.0728,
2226
+ "eval_rougeLsum": 0.0724,
2227
+ "eval_runtime": 9.5359,
2228
+ "eval_samples_per_second": 11.535,
2229
+ "eval_steps_per_second": 1.468,
2230
+ "step": 4565
2231
+ },
2232
+ {
2233
+ "epoch": 166.98,
2234
+ "eval_gen_len": 12.6636,
2235
+ "eval_loss": 0.32543399930000305,
2236
+ "eval_rouge1": 0.0865,
2237
+ "eval_rouge2": 0.0555,
2238
+ "eval_rougeL": 0.0812,
2239
+ "eval_rougeLsum": 0.0806,
2240
+ "eval_runtime": 9.5501,
2241
+ "eval_samples_per_second": 11.518,
2242
+ "eval_steps_per_second": 1.466,
2243
+ "step": 4592
2244
+ },
2245
+ {
2246
+ "epoch": 168.0,
2247
+ "eval_gen_len": 12.8455,
2248
+ "eval_loss": 0.3237576484680176,
2249
+ "eval_rouge1": 0.081,
2250
+ "eval_rouge2": 0.0501,
2251
+ "eval_rougeL": 0.0763,
2252
+ "eval_rougeLsum": 0.0754,
2253
+ "eval_runtime": 9.5386,
2254
+ "eval_samples_per_second": 11.532,
2255
+ "eval_steps_per_second": 1.468,
2256
+ "step": 4620
2257
+ },
2258
+ {
2259
+ "epoch": 168.98,
2260
+ "eval_gen_len": 13.1273,
2261
+ "eval_loss": 0.32304662466049194,
2262
+ "eval_rouge1": 0.0899,
2263
+ "eval_rouge2": 0.0584,
2264
+ "eval_rougeL": 0.0846,
2265
+ "eval_rougeLsum": 0.0836,
2266
+ "eval_runtime": 9.4963,
2267
+ "eval_samples_per_second": 11.583,
2268
+ "eval_steps_per_second": 1.474,
2269
+ "step": 4647
2270
+ },
2271
+ {
2272
+ "epoch": 170.0,
2273
+ "eval_gen_len": 13.5364,
2274
+ "eval_loss": 0.3217833638191223,
2275
+ "eval_rouge1": 0.0936,
2276
+ "eval_rouge2": 0.0579,
2277
+ "eval_rougeL": 0.0864,
2278
+ "eval_rougeLsum": 0.0854,
2279
+ "eval_runtime": 9.5121,
2280
+ "eval_samples_per_second": 11.564,
2281
+ "eval_steps_per_second": 1.472,
2282
+ "step": 4675
2283
+ },
2284
+ {
2285
+ "epoch": 170.98,
2286
+ "eval_gen_len": 13.9455,
2287
+ "eval_loss": 0.3209010660648346,
2288
+ "eval_rouge1": 0.0976,
2289
+ "eval_rouge2": 0.063,
2290
+ "eval_rougeL": 0.0914,
2291
+ "eval_rougeLsum": 0.0904,
2292
+ "eval_runtime": 9.507,
2293
+ "eval_samples_per_second": 11.57,
2294
+ "eval_steps_per_second": 1.473,
2295
+ "step": 4702
2296
+ },
2297
+ {
2298
+ "epoch": 172.0,
2299
+ "eval_gen_len": 14.2545,
2300
+ "eval_loss": 0.31976738572120667,
2301
+ "eval_rouge1": 0.1024,
2302
+ "eval_rouge2": 0.0663,
2303
+ "eval_rougeL": 0.0959,
2304
+ "eval_rougeLsum": 0.0946,
2305
+ "eval_runtime": 9.527,
2306
+ "eval_samples_per_second": 11.546,
2307
+ "eval_steps_per_second": 1.47,
2308
+ "step": 4730
2309
+ },
2310
+ {
2311
+ "epoch": 172.98,
2312
+ "eval_gen_len": 13.7909,
2313
+ "eval_loss": 0.31930533051490784,
2314
+ "eval_rouge1": 0.0943,
2315
+ "eval_rouge2": 0.0596,
2316
+ "eval_rougeL": 0.0883,
2317
+ "eval_rougeLsum": 0.0873,
2318
+ "eval_runtime": 9.5147,
2319
+ "eval_samples_per_second": 11.561,
2320
+ "eval_steps_per_second": 1.471,
2321
+ "step": 4757
2322
+ },
2323
+ {
2324
+ "epoch": 174.0,
2325
+ "eval_gen_len": 13.9182,
2326
+ "eval_loss": 0.3177041709423065,
2327
+ "eval_rouge1": 0.0964,
2328
+ "eval_rouge2": 0.0605,
2329
+ "eval_rougeL": 0.0905,
2330
+ "eval_rougeLsum": 0.0896,
2331
+ "eval_runtime": 9.4816,
2332
+ "eval_samples_per_second": 11.601,
2333
+ "eval_steps_per_second": 1.477,
2334
+ "step": 4785
2335
+ },
2336
+ {
2337
+ "epoch": 174.98,
2338
+ "eval_gen_len": 13.4636,
2339
+ "eval_loss": 0.3157893121242523,
2340
+ "eval_rouge1": 0.0978,
2341
+ "eval_rouge2": 0.0628,
2342
+ "eval_rougeL": 0.0918,
2343
+ "eval_rougeLsum": 0.0905,
2344
+ "eval_runtime": 9.4906,
2345
+ "eval_samples_per_second": 11.59,
2346
+ "eval_steps_per_second": 1.475,
2347
+ "step": 4812
2348
+ },
2349
+ {
2350
+ "epoch": 176.0,
2351
+ "eval_gen_len": 14.3273,
2352
+ "eval_loss": 0.3147883713245392,
2353
+ "eval_rouge1": 0.1021,
2354
+ "eval_rouge2": 0.0643,
2355
+ "eval_rougeL": 0.0951,
2356
+ "eval_rougeLsum": 0.0943,
2357
+ "eval_runtime": 9.5161,
2358
+ "eval_samples_per_second": 11.559,
2359
+ "eval_steps_per_second": 1.471,
2360
+ "step": 4840
2361
+ },
2362
+ {
2363
+ "epoch": 176.98,
2364
+ "eval_gen_len": 14.3727,
2365
+ "eval_loss": 0.31470227241516113,
2366
+ "eval_rouge1": 0.1043,
2367
+ "eval_rouge2": 0.0661,
2368
+ "eval_rougeL": 0.0982,
2369
+ "eval_rougeLsum": 0.0972,
2370
+ "eval_runtime": 9.5231,
2371
+ "eval_samples_per_second": 11.551,
2372
+ "eval_steps_per_second": 1.47,
2373
+ "step": 4867
2374
+ },
2375
+ {
2376
+ "epoch": 178.0,
2377
+ "eval_gen_len": 14.4455,
2378
+ "eval_loss": 0.31245002150535583,
2379
+ "eval_rouge1": 0.1068,
2380
+ "eval_rouge2": 0.0691,
2381
+ "eval_rougeL": 0.1016,
2382
+ "eval_rougeLsum": 0.1001,
2383
+ "eval_runtime": 9.5249,
2384
+ "eval_samples_per_second": 11.549,
2385
+ "eval_steps_per_second": 1.47,
2386
+ "step": 4895
2387
+ },
2388
+ {
2389
+ "epoch": 178.98,
2390
+ "eval_gen_len": 14.7091,
2391
+ "eval_loss": 0.3121263384819031,
2392
+ "eval_rouge1": 0.1073,
2393
+ "eval_rouge2": 0.0682,
2394
+ "eval_rougeL": 0.1012,
2395
+ "eval_rougeLsum": 0.1,
2396
+ "eval_runtime": 9.531,
2397
+ "eval_samples_per_second": 11.541,
2398
+ "eval_steps_per_second": 1.469,
2399
+ "step": 4922
2400
+ },
2401
+ {
2402
+ "epoch": 180.0,
2403
+ "eval_gen_len": 14.7182,
2404
+ "eval_loss": 0.310048907995224,
2405
+ "eval_rouge1": 0.1095,
2406
+ "eval_rouge2": 0.0698,
2407
+ "eval_rougeL": 0.1038,
2408
+ "eval_rougeLsum": 0.1022,
2409
+ "eval_runtime": 9.5031,
2410
+ "eval_samples_per_second": 11.575,
2411
+ "eval_steps_per_second": 1.473,
2412
+ "step": 4950
2413
+ },
2414
+ {
2415
+ "epoch": 180.98,
2416
+ "eval_gen_len": 14.5727,
2417
+ "eval_loss": 0.3098377585411072,
2418
+ "eval_rouge1": 0.1122,
2419
+ "eval_rouge2": 0.0717,
2420
+ "eval_rougeL": 0.1054,
2421
+ "eval_rougeLsum": 0.1048,
2422
+ "eval_runtime": 9.5178,
2423
+ "eval_samples_per_second": 11.557,
2424
+ "eval_steps_per_second": 1.471,
2425
+ "step": 4977
2426
+ },
2427
+ {
2428
+ "epoch": 181.82,
2429
+ "grad_norm": 0.6373523473739624,
2430
+ "learning_rate": 3.555555555555556e-06,
2431
+ "loss": 0.4093,
2432
+ "step": 5000
2433
+ },
2434
+ {
2435
+ "epoch": 182.0,
2436
+ "eval_gen_len": 14.8182,
2437
+ "eval_loss": 0.3088683784008026,
2438
+ "eval_rouge1": 0.1148,
2439
+ "eval_rouge2": 0.074,
2440
+ "eval_rougeL": 0.1082,
2441
+ "eval_rougeLsum": 0.1071,
2442
+ "eval_runtime": 9.5147,
2443
+ "eval_samples_per_second": 11.561,
2444
+ "eval_steps_per_second": 1.471,
2445
+ "step": 5005
2446
+ },
2447
+ {
2448
+ "epoch": 182.98,
2449
+ "eval_gen_len": 14.6727,
2450
+ "eval_loss": 0.30828168988227844,
2451
+ "eval_rouge1": 0.1088,
2452
+ "eval_rouge2": 0.0689,
2453
+ "eval_rougeL": 0.1021,
2454
+ "eval_rougeLsum": 0.1014,
2455
+ "eval_runtime": 9.5092,
2456
+ "eval_samples_per_second": 11.568,
2457
+ "eval_steps_per_second": 1.472,
2458
+ "step": 5032
2459
+ },
2460
+ {
2461
+ "epoch": 184.0,
2462
+ "eval_gen_len": 15.2182,
2463
+ "eval_loss": 0.30716758966445923,
2464
+ "eval_rouge1": 0.1167,
2465
+ "eval_rouge2": 0.0746,
2466
+ "eval_rougeL": 0.1098,
2467
+ "eval_rougeLsum": 0.1084,
2468
+ "eval_runtime": 9.5556,
2469
+ "eval_samples_per_second": 11.512,
2470
+ "eval_steps_per_second": 1.465,
2471
+ "step": 5060
2472
+ },
2473
+ {
2474
+ "epoch": 184.98,
2475
+ "eval_gen_len": 15.9364,
2476
+ "eval_loss": 0.3059370815753937,
2477
+ "eval_rouge1": 0.1233,
2478
+ "eval_rouge2": 0.08,
2479
+ "eval_rougeL": 0.1166,
2480
+ "eval_rougeLsum": 0.1157,
2481
+ "eval_runtime": 9.5537,
2482
+ "eval_samples_per_second": 11.514,
2483
+ "eval_steps_per_second": 1.465,
2484
+ "step": 5087
2485
+ },
2486
+ {
2487
+ "epoch": 186.0,
2488
+ "eval_gen_len": 15.1727,
2489
+ "eval_loss": 0.3056795597076416,
2490
+ "eval_rouge1": 0.1128,
2491
+ "eval_rouge2": 0.0707,
2492
+ "eval_rougeL": 0.1055,
2493
+ "eval_rougeLsum": 0.1049,
2494
+ "eval_runtime": 9.5621,
2495
+ "eval_samples_per_second": 11.504,
2496
+ "eval_steps_per_second": 1.464,
2497
+ "step": 5115
2498
+ },
2499
+ {
2500
+ "epoch": 186.98,
2501
+ "eval_gen_len": 15.1818,
2502
+ "eval_loss": 0.3043256402015686,
2503
+ "eval_rouge1": 0.1131,
2504
+ "eval_rouge2": 0.0707,
2505
+ "eval_rougeL": 0.1057,
2506
+ "eval_rougeLsum": 0.105,
2507
+ "eval_runtime": 9.5545,
2508
+ "eval_samples_per_second": 11.513,
2509
+ "eval_steps_per_second": 1.465,
2510
+ "step": 5142
2511
+ },
2512
+ {
2513
+ "epoch": 188.0,
2514
+ "eval_gen_len": 15.1727,
2515
+ "eval_loss": 0.30425599217414856,
2516
+ "eval_rouge1": 0.1125,
2517
+ "eval_rouge2": 0.0703,
2518
+ "eval_rougeL": 0.1052,
2519
+ "eval_rougeLsum": 0.1046,
2520
+ "eval_runtime": 9.5018,
2521
+ "eval_samples_per_second": 11.577,
2522
+ "eval_steps_per_second": 1.473,
2523
+ "step": 5170
2524
+ },
2525
+ {
2526
+ "epoch": 188.98,
2527
+ "eval_gen_len": 15.1636,
2528
+ "eval_loss": 0.30395984649658203,
2529
+ "eval_rouge1": 0.1128,
2530
+ "eval_rouge2": 0.0705,
2531
+ "eval_rougeL": 0.1054,
2532
+ "eval_rougeLsum": 0.1049,
2533
+ "eval_runtime": 9.5213,
2534
+ "eval_samples_per_second": 11.553,
2535
+ "eval_steps_per_second": 1.47,
2536
+ "step": 5197
2537
+ },
2538
+ {
2539
+ "epoch": 190.0,
2540
+ "eval_gen_len": 15.2455,
2541
+ "eval_loss": 0.30324217677116394,
2542
+ "eval_rouge1": 0.1136,
2543
+ "eval_rouge2": 0.0705,
2544
+ "eval_rougeL": 0.1061,
2545
+ "eval_rougeLsum": 0.1056,
2546
+ "eval_runtime": 9.5432,
2547
+ "eval_samples_per_second": 11.527,
2548
+ "eval_steps_per_second": 1.467,
2549
+ "step": 5225
2550
+ },
2551
+ {
2552
+ "epoch": 190.98,
2553
+ "eval_gen_len": 15.6182,
2554
+ "eval_loss": 0.30257585644721985,
2555
+ "eval_rouge1": 0.1149,
2556
+ "eval_rouge2": 0.071,
2557
+ "eval_rougeL": 0.1075,
2558
+ "eval_rougeLsum": 0.107,
2559
+ "eval_runtime": 9.5398,
2560
+ "eval_samples_per_second": 11.531,
2561
+ "eval_steps_per_second": 1.468,
2562
+ "step": 5252
2563
+ },
2564
+ {
2565
+ "epoch": 192.0,
2566
+ "eval_gen_len": 15.7545,
2567
+ "eval_loss": 0.301755428314209,
2568
+ "eval_rouge1": 0.118,
2569
+ "eval_rouge2": 0.0744,
2570
+ "eval_rougeL": 0.1114,
2571
+ "eval_rougeLsum": 0.1105,
2572
+ "eval_runtime": 9.5491,
2573
+ "eval_samples_per_second": 11.519,
2574
+ "eval_steps_per_second": 1.466,
2575
+ "step": 5280
2576
+ },
2577
+ {
2578
+ "epoch": 192.98,
2579
+ "eval_gen_len": 15.7545,
2580
+ "eval_loss": 0.30100175738334656,
2581
+ "eval_rouge1": 0.1186,
2582
+ "eval_rouge2": 0.0756,
2583
+ "eval_rougeL": 0.1122,
2584
+ "eval_rougeLsum": 0.1116,
2585
+ "eval_runtime": 9.5605,
2586
+ "eval_samples_per_second": 11.506,
2587
+ "eval_steps_per_second": 1.464,
2588
+ "step": 5307
2589
+ },
2590
+ {
2591
+ "epoch": 194.0,
2592
+ "eval_gen_len": 15.6727,
2593
+ "eval_loss": 0.3014240562915802,
2594
+ "eval_rouge1": 0.1169,
2595
+ "eval_rouge2": 0.0738,
2596
+ "eval_rougeL": 0.1106,
2597
+ "eval_rougeLsum": 0.1094,
2598
+ "eval_runtime": 9.5649,
2599
+ "eval_samples_per_second": 11.5,
2600
+ "eval_steps_per_second": 1.464,
2601
+ "step": 5335
2602
+ },
2603
+ {
2604
+ "epoch": 194.98,
2605
+ "eval_gen_len": 15.6364,
2606
+ "eval_loss": 0.3001127541065216,
2607
+ "eval_rouge1": 0.1161,
2608
+ "eval_rouge2": 0.0734,
2609
+ "eval_rougeL": 0.1104,
2610
+ "eval_rougeLsum": 0.1092,
2611
+ "eval_runtime": 9.558,
2612
+ "eval_samples_per_second": 11.509,
2613
+ "eval_steps_per_second": 1.465,
2614
+ "step": 5362
2615
+ },
2616
+ {
2617
+ "epoch": 196.0,
2618
+ "eval_gen_len": 15.7909,
2619
+ "eval_loss": 0.29918792843818665,
2620
+ "eval_rouge1": 0.1192,
2621
+ "eval_rouge2": 0.0752,
2622
+ "eval_rougeL": 0.1132,
2623
+ "eval_rougeLsum": 0.1125,
2624
+ "eval_runtime": 9.5635,
2625
+ "eval_samples_per_second": 11.502,
2626
+ "eval_steps_per_second": 1.464,
2627
+ "step": 5390
2628
+ },
2629
+ {
2630
+ "epoch": 196.98,
2631
+ "eval_gen_len": 15.6364,
2632
+ "eval_loss": 0.2991277277469635,
2633
+ "eval_rouge1": 0.1205,
2634
+ "eval_rouge2": 0.0772,
2635
+ "eval_rougeL": 0.1141,
2636
+ "eval_rougeLsum": 0.1133,
2637
+ "eval_runtime": 9.524,
2638
+ "eval_samples_per_second": 11.55,
2639
+ "eval_steps_per_second": 1.47,
2640
+ "step": 5417
2641
+ },
2642
+ {
2643
+ "epoch": 198.0,
2644
+ "eval_gen_len": 15.7909,
2645
+ "eval_loss": 0.2986967861652374,
2646
+ "eval_rouge1": 0.1202,
2647
+ "eval_rouge2": 0.0768,
2648
+ "eval_rougeL": 0.1143,
2649
+ "eval_rougeLsum": 0.1138,
2650
+ "eval_runtime": 9.5106,
2651
+ "eval_samples_per_second": 11.566,
2652
+ "eval_steps_per_second": 1.472,
2653
+ "step": 5445
2654
+ },
2655
+ {
2656
+ "epoch": 198.98,
2657
+ "eval_gen_len": 15.6182,
2658
+ "eval_loss": 0.2981720566749573,
2659
+ "eval_rouge1": 0.1217,
2660
+ "eval_rouge2": 0.0793,
2661
+ "eval_rougeL": 0.1153,
2662
+ "eval_rougeLsum": 0.1148,
2663
+ "eval_runtime": 9.5676,
2664
+ "eval_samples_per_second": 11.497,
2665
+ "eval_steps_per_second": 1.463,
2666
+ "step": 5472
2667
+ },
2668
+ {
2669
+ "epoch": 200.0,
2670
+ "grad_norm": 1.2200640439987183,
2671
+ "learning_rate": 1.909465020576132e-06,
2672
+ "loss": 0.3864,
2673
+ "step": 5500
2674
+ },
2675
+ {
2676
+ "epoch": 200.0,
2677
+ "eval_gen_len": 15.7818,
2678
+ "eval_loss": 0.29755449295043945,
2679
+ "eval_rouge1": 0.1218,
2680
+ "eval_rouge2": 0.079,
2681
+ "eval_rougeL": 0.1154,
2682
+ "eval_rougeLsum": 0.1147,
2683
+ "eval_runtime": 9.547,
2684
+ "eval_samples_per_second": 11.522,
2685
+ "eval_steps_per_second": 1.466,
2686
+ "step": 5500
2687
+ },
2688
+ {
2689
+ "epoch": 200.98,
2690
+ "eval_gen_len": 15.6182,
2691
+ "eval_loss": 0.29704758524894714,
2692
+ "eval_rouge1": 0.1219,
2693
+ "eval_rouge2": 0.0811,
2694
+ "eval_rougeL": 0.1163,
2695
+ "eval_rougeLsum": 0.116,
2696
+ "eval_runtime": 9.5552,
2697
+ "eval_samples_per_second": 11.512,
2698
+ "eval_steps_per_second": 1.465,
2699
+ "step": 5527
2700
+ },
2701
+ {
2702
+ "epoch": 202.0,
2703
+ "eval_gen_len": 15.5273,
2704
+ "eval_loss": 0.29719075560569763,
2705
+ "eval_rouge1": 0.122,
2706
+ "eval_rouge2": 0.0811,
2707
+ "eval_rougeL": 0.116,
2708
+ "eval_rougeLsum": 0.1157,
2709
+ "eval_runtime": 9.5815,
2710
+ "eval_samples_per_second": 11.48,
2711
+ "eval_steps_per_second": 1.461,
2712
+ "step": 5555
2713
+ },
2714
+ {
2715
+ "epoch": 202.98,
2716
+ "eval_gen_len": 15.6909,
2717
+ "eval_loss": 0.29677239060401917,
2718
+ "eval_rouge1": 0.1209,
2719
+ "eval_rouge2": 0.0803,
2720
+ "eval_rougeL": 0.1156,
2721
+ "eval_rougeLsum": 0.1149,
2722
+ "eval_runtime": 9.499,
2723
+ "eval_samples_per_second": 11.58,
2724
+ "eval_steps_per_second": 1.474,
2725
+ "step": 5582
2726
+ },
2727
+ {
2728
+ "epoch": 204.0,
2729
+ "eval_gen_len": 15.5273,
2730
+ "eval_loss": 0.2963531017303467,
2731
+ "eval_rouge1": 0.1251,
2732
+ "eval_rouge2": 0.0846,
2733
+ "eval_rougeL": 0.1207,
2734
+ "eval_rougeLsum": 0.1194,
2735
+ "eval_runtime": 9.5143,
2736
+ "eval_samples_per_second": 11.562,
2737
+ "eval_steps_per_second": 1.471,
2738
+ "step": 5610
2739
+ },
2740
+ {
2741
+ "epoch": 204.98,
2742
+ "eval_gen_len": 15.6909,
2743
+ "eval_loss": 0.29531627893447876,
2744
+ "eval_rouge1": 0.1239,
2745
+ "eval_rouge2": 0.0831,
2746
+ "eval_rougeL": 0.1193,
2747
+ "eval_rougeLsum": 0.1184,
2748
+ "eval_runtime": 9.5198,
2749
+ "eval_samples_per_second": 11.555,
2750
+ "eval_steps_per_second": 1.471,
2751
+ "step": 5637
2752
+ },
2753
+ {
2754
+ "epoch": 206.0,
2755
+ "eval_gen_len": 15.5273,
2756
+ "eval_loss": 0.29536357522010803,
2757
+ "eval_rouge1": 0.1236,
2758
+ "eval_rouge2": 0.0835,
2759
+ "eval_rougeL": 0.1192,
2760
+ "eval_rougeLsum": 0.1182,
2761
+ "eval_runtime": 9.5066,
2762
+ "eval_samples_per_second": 11.571,
2763
+ "eval_steps_per_second": 1.473,
2764
+ "step": 5665
2765
+ },
2766
+ {
2767
+ "epoch": 206.98,
2768
+ "eval_gen_len": 15.6727,
2769
+ "eval_loss": 0.2951861023902893,
2770
+ "eval_rouge1": 0.1236,
2771
+ "eval_rouge2": 0.0832,
2772
+ "eval_rougeL": 0.1191,
2773
+ "eval_rougeLsum": 0.1181,
2774
+ "eval_runtime": 9.5179,
2775
+ "eval_samples_per_second": 11.557,
2776
+ "eval_steps_per_second": 1.471,
2777
+ "step": 5692
2778
+ },
2779
+ {
2780
+ "epoch": 208.0,
2781
+ "eval_gen_len": 15.8273,
2782
+ "eval_loss": 0.29457393288612366,
2783
+ "eval_rouge1": 0.1262,
2784
+ "eval_rouge2": 0.0856,
2785
+ "eval_rougeL": 0.1223,
2786
+ "eval_rougeLsum": 0.121,
2787
+ "eval_runtime": 9.5235,
2788
+ "eval_samples_per_second": 11.55,
2789
+ "eval_steps_per_second": 1.47,
2790
+ "step": 5720
2791
+ },
2792
+ {
2793
+ "epoch": 208.98,
2794
+ "eval_gen_len": 15.8273,
2795
+ "eval_loss": 0.29461580514907837,
2796
+ "eval_rouge1": 0.1269,
2797
+ "eval_rouge2": 0.086,
2798
+ "eval_rougeL": 0.1227,
2799
+ "eval_rougeLsum": 0.1213,
2800
+ "eval_runtime": 9.5193,
2801
+ "eval_samples_per_second": 11.556,
2802
+ "eval_steps_per_second": 1.471,
2803
+ "step": 5747
2804
+ },
2805
+ {
2806
+ "epoch": 210.0,
2807
+ "eval_gen_len": 15.6727,
2808
+ "eval_loss": 0.2948046028614044,
2809
+ "eval_rouge1": 0.1261,
2810
+ "eval_rouge2": 0.0859,
2811
+ "eval_rougeL": 0.1221,
2812
+ "eval_rougeLsum": 0.1208,
2813
+ "eval_runtime": 9.5297,
2814
+ "eval_samples_per_second": 11.543,
2815
+ "eval_steps_per_second": 1.469,
2816
+ "step": 5775
2817
+ },
2818
+ {
2819
+ "epoch": 210.98,
2820
+ "eval_gen_len": 15.7636,
2821
+ "eval_loss": 0.29468077421188354,
2822
+ "eval_rouge1": 0.129,
2823
+ "eval_rouge2": 0.0888,
2824
+ "eval_rougeL": 0.1244,
2825
+ "eval_rougeLsum": 0.1235,
2826
+ "eval_runtime": 9.5285,
2827
+ "eval_samples_per_second": 11.544,
2828
+ "eval_steps_per_second": 1.469,
2829
+ "step": 5802
2830
+ },
2831
+ {
2832
+ "epoch": 212.0,
2833
+ "eval_gen_len": 15.9091,
2834
+ "eval_loss": 0.2943172752857208,
2835
+ "eval_rouge1": 0.1308,
2836
+ "eval_rouge2": 0.0909,
2837
+ "eval_rougeL": 0.1267,
2838
+ "eval_rougeLsum": 0.1254,
2839
+ "eval_runtime": 9.5333,
2840
+ "eval_samples_per_second": 11.538,
2841
+ "eval_steps_per_second": 1.469,
2842
+ "step": 5830
2843
+ },
2844
+ {
2845
+ "epoch": 212.98,
2846
+ "eval_gen_len": 15.9091,
2847
+ "eval_loss": 0.2938406467437744,
2848
+ "eval_rouge1": 0.1293,
2849
+ "eval_rouge2": 0.0888,
2850
+ "eval_rougeL": 0.1251,
2851
+ "eval_rougeLsum": 0.1236,
2852
+ "eval_runtime": 9.5369,
2853
+ "eval_samples_per_second": 11.534,
2854
+ "eval_steps_per_second": 1.468,
2855
+ "step": 5857
2856
+ },
2857
+ {
2858
+ "epoch": 214.0,
2859
+ "eval_gen_len": 15.9727,
2860
+ "eval_loss": 0.2933821678161621,
2861
+ "eval_rouge1": 0.1284,
2862
+ "eval_rouge2": 0.0877,
2863
+ "eval_rougeL": 0.1243,
2864
+ "eval_rougeLsum": 0.123,
2865
+ "eval_runtime": 9.5327,
2866
+ "eval_samples_per_second": 11.539,
2867
+ "eval_steps_per_second": 1.469,
2868
+ "step": 5885
2869
+ },
2870
+ {
2871
+ "epoch": 214.98,
2872
+ "eval_gen_len": 15.8727,
2873
+ "eval_loss": 0.2933785617351532,
2874
+ "eval_rouge1": 0.1262,
2875
+ "eval_rouge2": 0.0855,
2876
+ "eval_rougeL": 0.1221,
2877
+ "eval_rougeLsum": 0.1208,
2878
+ "eval_runtime": 9.542,
2879
+ "eval_samples_per_second": 11.528,
2880
+ "eval_steps_per_second": 1.467,
2881
+ "step": 5912
2882
+ },
2883
+ {
2884
+ "epoch": 216.0,
2885
+ "eval_gen_len": 15.8727,
2886
+ "eval_loss": 0.29337677359580994,
2887
+ "eval_rouge1": 0.1267,
2888
+ "eval_rouge2": 0.0863,
2889
+ "eval_rougeL": 0.1225,
2890
+ "eval_rougeLsum": 0.1214,
2891
+ "eval_runtime": 9.5387,
2892
+ "eval_samples_per_second": 11.532,
2893
+ "eval_steps_per_second": 1.468,
2894
+ "step": 5940
2895
+ },
2896
+ {
2897
+ "epoch": 216.98,
2898
+ "eval_gen_len": 15.9636,
2899
+ "eval_loss": 0.2933517396450043,
2900
+ "eval_rouge1": 0.1292,
2901
+ "eval_rouge2": 0.0894,
2902
+ "eval_rougeL": 0.1254,
2903
+ "eval_rougeLsum": 0.1241,
2904
+ "eval_runtime": 9.5629,
2905
+ "eval_samples_per_second": 11.503,
2906
+ "eval_steps_per_second": 1.464,
2907
+ "step": 5967
2908
+ },
2909
+ {
2910
+ "epoch": 218.0,
2911
+ "eval_gen_len": 15.9636,
2912
+ "eval_loss": 0.29335835576057434,
2913
+ "eval_rouge1": 0.1289,
2914
+ "eval_rouge2": 0.0889,
2915
+ "eval_rougeL": 0.1249,
2916
+ "eval_rougeLsum": 0.1234,
2917
+ "eval_runtime": 9.5531,
2918
+ "eval_samples_per_second": 11.515,
2919
+ "eval_steps_per_second": 1.465,
2920
+ "step": 5995
2921
+ },
2922
+ {
2923
+ "epoch": 218.18,
2924
+ "grad_norm": 0.6804682016372681,
2925
+ "learning_rate": 2.6337448559670784e-07,
2926
+ "loss": 0.3747,
2927
+ "step": 6000
2928
+ }
2929
+ ],
2930
+ "logging_steps": 500,
2931
+ "max_steps": 6075,
2932
+ "num_input_tokens_seen": 0,
2933
+ "num_train_epochs": 225,
2934
+ "save_steps": 500,
2935
+ "total_flos": 1.1611644167297434e+17,
2936
+ "train_batch_size": 8,
2937
+ "trial_name": null,
2938
+ "trial_params": null
2939
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa417b6b216c589bddc528e9d4e9c81b5edc8c24a516e2f555101dee203aa64c
3
- size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:561a4e3664cd42ae01fabfa43b75ecfaaf6c196ad5b44203d935f0e7bbbb1eba
3
+ size 5112