tarekziade commited on
Commit
2963c42
1 Parent(s): cbd6135

New training round

Browse files

- Corrected flickr30k dataset
- 5000k images from Pexels to reduce class imbalance

README.md CHANGED
@@ -1,60 +1,78 @@
1
- ---
2
- tags:
3
- - image-to-text
4
- - image-captioning
5
- license: apache-2.0
6
- metrics:
7
- - rouge
8
- datasets:
9
- - Mozilla/flickr30k-transformed-captions-gpt4o
10
- widget:
11
- - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/savanna.jpg
12
- example_title: Savanna
13
- - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg
14
- example_title: Football Match
15
- - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/airport.jpg
16
- example_title: Airport
17
- base_model:
18
- - google/vit-base-patch16-224-in21k
19
- ---
20
-
21
- # distilvit
22
-
23
- This model is a work in progress. Fine-tuned version of those base models:
24
-
25
- - a VIT model for the image encoder: https://huggingface.co/google/vit-base-patch16-224-in21k
26
- - a Distilled GPT-2 model for the text decoder: https://huggingface.co/distilbert/distilgpt2
27
-
28
- This model was trained on:
29
-
30
- - [Flickr30k debiased](https://huggingface.co/datasets/Mozilla/flickr30k-transformed-captions-gpt4o)
31
- - [DocOrNot](https://huggingface.co/datasets/Mozilla/docornot)
32
- - [Alt Text Validation](https://huggingface.co/datasets/Mozilla/alt-text-validation)
33
- - A debiased version of COCO 2017: https://cocodataset.org
34
-
35
- You can find the code used to create the model here: https://github.com/mozilla/distilvit
36
-
37
-
38
- # training results
39
-
40
- - eval/gen_len 14.99729
41
- - eval/loss 0.17093
42
- - eval/meteor 0.51479
43
- - eval/rouge1 57.8066
44
- - eval/rouge2 35.0888
45
- - eval/rougeL 52.9138
46
- - eval/rougeLsum 52.9101
47
- - eval/runtime 760.2135
48
- - eval/samples_per_second 11.18
49
- - eval/steps_per_second 0.112
50
- - train/epoch 8.0
51
- - train/global_step 11752
52
- - train/learning_rate 0.0
53
- - train/loss 0.1034
54
- - train/total_flos 1.518634875573869e+20
55
- - train/train_loss 0.14875
56
- - train/train_runtime 91405.9053
57
- - train/train_samples_per_second 12.855
58
- - train/train_steps_per_second 0.129
59
-
60
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - image-to-text
4
+ - image-captioning
5
+ license: apache-2.0
6
+ metrics:
7
+ - rouge
8
+ datasets:
9
+ - Mozilla/flickr30k-transformed-captions
10
+ widget:
11
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/savanna.jpg
12
+ example_title: Savanna
13
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg
14
+ example_title: Football Match
15
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/airport.jpg
16
+ example_title: Airport
17
+ base_model:
18
+ - google/vit-base-patch16-224-in21k
19
+
20
+ model-index:
21
+ - name: mozilla/distilvit
22
+ results:
23
+ - task:
24
+ type: image-to-text
25
+ name: Image To Text
26
+ dataset:
27
+ name: Mozilla/flickr30k-transformed-captions
28
+ type: Mozilla/flickr30k-transformed-captions
29
+ metrics:
30
+ - name: ROUGE-1
31
+ type: rouge
32
+ value: 43.006
33
+ verified: true
34
+ - name: ROUGE-2
35
+ type: rouge
36
+ value: 16.9939
37
+ verified: true
38
+ - name: ROUGE-L
39
+ type: rouge
40
+ value: 38.8923
41
+ verified: true
42
+ - name: ROUGE-LSUM
43
+ type: rouge
44
+ value: 38.8877
45
+ verified: true
46
+ - name: loss
47
+ type: loss
48
+ value: 0.19939416646957397
49
+ - name: gen_len
50
+ type: gen_len
51
+ value: 11.327256736227712
52
+ verified: true
53
+ ---
54
+
55
+ # distilvit
56
+
57
+ This model is a work in progress. Fine-tuned version of those base models:
58
+
59
+ - a VIT model for the image encoder: https://huggingface.co/google/vit-base-patch16-224-in21k
60
+ - a Distilled GPT-2 model for the text decoder: https://huggingface.co/distilbert/distilgpt2
61
+
62
+ This model was trained on:
63
+
64
+ - Flickr30k : https://huggingface.co/datasets/nlphuji/flickr30k
65
+ - COCO 2017: https://cocodataset.org
66
+
67
+ You can get that checkpoint using the 3083a3cef6e3c8dd90df3f088074bbe836b0f403 commit.
68
+
69
+ It was then further fine-tuned on :
70
+
71
+ - [Flickr30k debiased](https://huggingface.co/datasets/Mozilla/flickr30k-transformed-captions)
72
+ - [DocOrNot](https://huggingface.co/datasets/Mozilla/docornot)
73
+ - [Alt Text Validation](https://huggingface.co/datasets/Mozilla/alt-text-validation)
74
+
75
+ For the latter, the dataset was annotated by our team to correct the alt text generated by the model,
76
+ using the [checkvite tool](https://github.com/mozilla/checkvite).
77
+
78
+ You can find the code used to create the model here: https://github.com/mozilla/distilvit
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "C:\\Users\\tarek\\Dev\\distilvit\\distilvit\\..\\vit-base-patch16-224-in21k-distilgpt2",
3
  "architectures": [
4
  "VisionEncoderDecoderModel"
5
  ],
@@ -37,7 +37,7 @@
37
  "label2id": {
38
  "LABEL_0": 0
39
  },
40
- "layer_norm_epsilon": 1e-05,
41
  "length_penalty": 1.0,
42
  "max_length": 20,
43
  "min_length": 0,
 
1
  {
2
+ "_name_or_path": "mozilla/distilvit",
3
  "architectures": [
4
  "VisionEncoderDecoderModel"
5
  ],
 
37
  "label2id": {
38
  "LABEL_0": 0
39
  },
40
+ "layer_norm_epsilon": 1e-5,
41
  "length_penalty": 1.0,
42
  "max_length": 20,
43
  "min_length": 0,
generation_config.json CHANGED
@@ -1,17 +1,16 @@
1
  {
2
- "bos_token_id": 50256,
3
- "early_stopping": true,
4
- "eos_token_id": 50256,
5
- "max_length": 50,
6
- "no_repeat_ngram_size": 3,
7
- "num_beams": 2,
8
- "pad_token_id": 50256,
9
- "repetition_penalty": 1.3,
10
- "seed": 12,
11
- "max_time": 5,
12
- "transformers_version": "4.33.2",
13
- "do_sample": true,
14
- "temperature": 0.7,
15
- "top_p": 0.9,
16
- "top_k": 50
17
  }
 
1
  {
2
+ "bos_token_id": 50256,
3
+ "do_sample": true,
4
+ "early_stopping": true,
5
+ "eos_token_id": 50256,
6
+ "max_length": 50,
7
+ "max_time": 5,
8
+ "no_repeat_ngram_size": 2,
9
+ "num_beams": 2,
10
+ "pad_token_id": 50256,
11
+ "repetition_penalty": 1.4,
12
+ "seed": 12,
13
+ "temperature": 0.8,
14
+ "top_p": 0.9,
15
+ "transformers_version": "4.33.2"
 
16
  }
onnx/decoder_model.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74a2588b4e510214a2b4f7b89e5805e6f8d88982d2800d62ba4c3f1589957f73
3
  size 385864797
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5aedbc7bb81581bdd4b90f65db39dc22b3dcdea3923d67542449aa24bd46eaf7
3
  size 385864797
onnx/decoder_model_merged.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44d84666cb213e0efdca9607f60890d21d2a6f8e74af659465b7f01ef40abda5
3
  size 387342586
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c398f8e6fd8bf8b03105a7ada1541f488c07870229210a128205c6492b7c38ed
3
  size 387342586
onnx/decoder_model_merged_quantized.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b4f88b4343b356236aac05f47319b0f32920ff18c9ac97919dee68cca72dc8c
3
  size 99759579
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c66eda97aa444b17357dfba1c9827abc7824d6fd905dd28f097f74ddef02943
3
  size 99759579
onnx/decoder_model_quantized.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49e086aa9b53b659ea8714e1dfd0d5ac9fa74b1a9fec949de911a43511f4fd1d
3
  size 98065763
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cda4a51db36003185beb5557cc379d0ad773d4efa9329e39793828b74d420f4
3
  size 98065763
onnx/decoder_with_past_model.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff3041032448a1af189cd6d017b5bdc82d1260481648b49b173c982ce2d17a9b
3
  size 385864377
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ab017c9a31793692bb56c0b93be42cb0090d97f6efc9cbd25f092a93451f45e
3
  size 385864377
onnx/decoder_with_past_model_quantized.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a223f391306a0d9277b954dbad457c16a70227d1b7b8c29ce0523609c3b4468b
3
  size 98063170
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7885df45251f50971c9a8fdf11c79011342988bcabdb0649470751076226b8ae
3
  size 98063170
onnx/encoder_model.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f96cda5722cdd6be96aeb5f0e1901a0365bb0024936fb1feb71a107b157fda5
3
  size 343440632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e48effb1e61fcecf4d1587fb15894d15510f366971ff22224719048168a70707
3
  size 343440632
onnx/encoder_model_quantized.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:930d1c85e497cc090b1c4894bef857cf0944a0c2157d2bc5fe2d421ad290077b
3
- size 87038170
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04622bdece4419687c8a07f4df0e8c1bea1db354fb6decde97bc0bf39f26bab2
3
+ size 87038173
preprocessor_config.json CHANGED
@@ -7,7 +7,6 @@
7
  0.5,
8
  0.5
9
  ],
10
- "image_processor_type": "ViTImageProcessor",
11
  "feature_extractor_type": "ViTImageProcessor",
12
  "image_std": [
13
  0.5,
 
7
  0.5,
8
  0.5
9
  ],
 
10
  "feature_extractor_type": "ViTImageProcessor",
11
  "image_std": [
12
  0.5,
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c482fc553889157006b94f592050aba8b9e45475b42e8497ffa2adad938d3c7
3
  size 730052378
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea90951d68491925423f56366196ab4d2a7cbafecc88d58623b8f74ae2118872
3
  size 730052378
quantize_config.json CHANGED
@@ -4,120 +4,120 @@
4
  "per_model_config": {
5
  "decoder_model": {
6
  "op_types": [
7
- "Sub",
8
- "Concat",
9
- "Squeeze",
10
- "Where",
11
- "Slice",
12
  "Sqrt",
 
 
13
  "Mul",
 
 
 
14
  "Pow",
15
  "Div",
16
- "MatMul",
 
 
 
17
  "Gather",
 
 
 
18
  "Softmax",
19
- "Transpose",
20
  "ConstantOfShape",
21
- "Reshape",
22
- "Cast",
23
  "Tanh",
24
- "Unsqueeze",
25
  "Add",
26
- "Split",
27
- "Shape",
28
- "Gemm",
29
- "Constant",
30
- "Range",
31
- "ReduceMean"
32
  ],
33
  "weight_type": "QInt8"
34
  },
35
  "decoder_model_merged": {
36
  "op_types": [
37
- "Sub",
38
- "Concat",
39
- "Squeeze",
40
- "Where",
41
- "Slice",
42
  "Sqrt",
 
 
43
  "Mul",
 
 
 
 
44
  "Pow",
45
  "Div",
46
- "If",
47
- "MatMul",
 
 
48
  "Gather",
 
 
 
49
  "Softmax",
50
- "Transpose",
51
  "ConstantOfShape",
52
- "Reshape",
53
- "Cast",
54
  "Tanh",
55
- "Unsqueeze",
56
  "Add",
57
- "Split",
58
- "Shape",
59
- "Gemm",
60
- "Constant",
61
- "Range",
62
- "ReduceMean"
63
  ],
64
  "weight_type": "QInt8"
65
  },
66
  "decoder_with_past_model": {
67
  "op_types": [
68
- "Sub",
69
- "Concat",
70
- "Squeeze",
71
- "Where",
72
- "Slice",
73
  "Sqrt",
 
 
74
  "Mul",
 
 
 
75
  "Pow",
76
  "Div",
77
- "MatMul",
 
 
 
78
  "Gather",
 
 
 
79
  "Softmax",
80
- "Transpose",
81
  "ConstantOfShape",
82
- "Reshape",
83
- "Cast",
84
- "Tanh",
85
- "Unsqueeze",
86
- "Add",
87
- "Split",
88
- "Shape",
89
  "Gemm",
90
- "Constant",
91
- "Range",
92
- "ReduceMean"
 
93
  ],
94
  "weight_type": "QInt8"
95
  },
96
  "encoder_model": {
97
  "op_types": [
98
- "Sub",
99
- "Concat",
100
- "Where",
101
- "Slice",
102
  "Sqrt",
 
 
103
  "Mul",
 
 
 
104
  "Pow",
105
  "Div",
106
- "MatMul",
 
 
 
107
  "Gather",
108
  "Erf",
109
- "Softmax",
110
- "Transpose",
111
- "Expand",
112
- "ConstantOfShape",
113
- "Reshape",
114
  "Unsqueeze",
115
- "Conv",
116
- "Add",
117
- "Shape",
118
  "Equal",
119
- "Constant",
120
- "ReduceMean"
 
 
 
 
121
  ],
122
  "weight_type": "QUInt8"
123
  }
 
4
  "per_model_config": {
5
  "decoder_model": {
6
  "op_types": [
 
 
 
 
 
7
  "Sqrt",
8
+ "MatMul",
9
+ "Concat",
10
  "Mul",
11
+ "Constant",
12
+ "Shape",
13
+ "Range",
14
  "Pow",
15
  "Div",
16
+ "Split",
17
+ "Sub",
18
+ "Where",
19
+ "Reshape",
20
  "Gather",
21
+ "Unsqueeze",
22
+ "Cast",
23
+ "ReduceMean",
24
  "Softmax",
 
25
  "ConstantOfShape",
26
+ "Slice",
27
+ "Gemm",
28
  "Tanh",
29
+ "Transpose",
30
  "Add",
31
+ "Squeeze"
 
 
 
 
 
32
  ],
33
  "weight_type": "QInt8"
34
  },
35
  "decoder_model_merged": {
36
  "op_types": [
 
 
 
 
 
37
  "Sqrt",
38
+ "MatMul",
39
+ "Concat",
40
  "Mul",
41
+ "If",
42
+ "Constant",
43
+ "Shape",
44
+ "Range",
45
  "Pow",
46
  "Div",
47
+ "Split",
48
+ "Sub",
49
+ "Where",
50
+ "Reshape",
51
  "Gather",
52
+ "Unsqueeze",
53
+ "Cast",
54
+ "ReduceMean",
55
  "Softmax",
 
56
  "ConstantOfShape",
57
+ "Slice",
58
+ "Gemm",
59
  "Tanh",
60
+ "Transpose",
61
  "Add",
62
+ "Squeeze"
 
 
 
 
 
63
  ],
64
  "weight_type": "QInt8"
65
  },
66
  "decoder_with_past_model": {
67
  "op_types": [
 
 
 
 
 
68
  "Sqrt",
69
+ "MatMul",
70
+ "Concat",
71
  "Mul",
72
+ "Constant",
73
+ "Shape",
74
+ "Range",
75
  "Pow",
76
  "Div",
77
+ "Split",
78
+ "Sub",
79
+ "Where",
80
+ "Reshape",
81
  "Gather",
82
+ "Unsqueeze",
83
+ "Cast",
84
+ "ReduceMean",
85
  "Softmax",
 
86
  "ConstantOfShape",
87
+ "Slice",
 
 
 
 
 
 
88
  "Gemm",
89
+ "Tanh",
90
+ "Transpose",
91
+ "Squeeze",
92
+ "Add"
93
  ],
94
  "weight_type": "QInt8"
95
  },
96
  "encoder_model": {
97
  "op_types": [
 
 
 
 
98
  "Sqrt",
99
+ "MatMul",
100
+ "Concat",
101
  "Mul",
102
+ "Constant",
103
+ "Expand",
104
+ "Shape",
105
  "Pow",
106
  "Div",
107
+ "Conv",
108
+ "Sub",
109
+ "Where",
110
+ "Reshape",
111
  "Gather",
112
  "Erf",
 
 
 
 
 
113
  "Unsqueeze",
 
 
 
114
  "Equal",
115
+ "ReduceMean",
116
+ "Softmax",
117
+ "ConstantOfShape",
118
+ "Slice",
119
+ "Transpose",
120
+ "Add"
121
  ],
122
  "weight_type": "QUInt8"
123
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a963b881745d00ac683701b379f674fa0dbcd989df12e27bc7247cec0fdee7b
3
- size 4664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6df8b3132ea43f82797da62dc92bab7492b597315c59b9eeae0937e97904f9e0
3
+ size 4728