New training round

Browse files

- Corrected flickr30k dataset
- 5000k images from Pexels to reduce class imbalance

Files changed (15) hide show

README.md +78 -60
config.json +2 -2
generation_config.json +14 -15
onnx/decoder_model.onnx +1 -1
onnx/decoder_model_merged.onnx +1 -1
onnx/decoder_model_merged_quantized.onnx +1 -1
onnx/decoder_model_quantized.onnx +1 -1
onnx/decoder_with_past_model.onnx +1 -1
onnx/decoder_with_past_model_quantized.onnx +1 -1
onnx/encoder_model.onnx +1 -1
onnx/encoder_model_quantized.onnx +2 -2
preprocessor_config.json +0 -1
pytorch_model.bin +1 -1
quantize_config.json +65 -65
training_args.bin +2 -2

README.md CHANGED Viewed

@@ -1,60 +1,78 @@
----
-tags:
-  - image-to-text
-  - image-captioning
-license: apache-2.0
-metrics:
-  - rouge
-datasets:
-  - Mozilla/flickr30k-transformed-captions-gpt4o
-widget:
-  - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/savanna.jpg
-    example_title: Savanna
-  - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg
-    example_title: Football Match
-  - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/airport.jpg
-    example_title: Airport
-base_model:
-  - google/vit-base-patch16-224-in21k
----
-# distilvit
-This model is a work in progress. Fine-tuned version of those base models:
-- a VIT model for the image encoder: https://huggingface.co/google/vit-base-patch16-224-in21k
-- a Distilled GPT-2 model for the text decoder: https://huggingface.co/distilbert/distilgpt2
-This model was trained on:
-- [Flickr30k debiased](https://huggingface.co/datasets/Mozilla/flickr30k-transformed-captions-gpt4o)
-- [DocOrNot](https://huggingface.co/datasets/Mozilla/docornot)
-- [Alt Text Validation](https://huggingface.co/datasets/Mozilla/alt-text-validation)
-- A debiased version of COCO 2017: https://cocodataset.org
-You can find the code used to create the model here: https://github.com/mozilla/distilvit
-# training results
-- eval/gen_len 14.99729
-- eval/loss 0.17093
-- eval/meteor 0.51479
-- eval/rouge1 57.8066
-- eval/rouge2 35.0888
-- eval/rougeL 52.9138
-- eval/rougeLsum 52.9101
-- eval/runtime 760.2135
-- eval/samples_per_second 11.18
-- eval/steps_per_second 0.112
-- train/epoch 8.0
-- train/global_step 11752
-- train/learning_rate 0.0
-- train/loss 0.1034
-- train/total_flos 1.518634875573869e+20
-- train/train_loss 0.14875
-- train/train_runtime 91405.9053
-- train/train_samples_per_second 12.855
-- train/train_steps_per_second 0.129

+---
+tags:
+  - image-to-text
+  - image-captioning
+license: apache-2.0
+metrics:
+  - rouge
+datasets:
+  - Mozilla/flickr30k-transformed-captions
+widget:
+  - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/savanna.jpg
+    example_title: Savanna
+  - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg
+    example_title: Football Match
+  - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/airport.jpg
+    example_title: Airport
+base_model:
+  - google/vit-base-patch16-224-in21k
+model-index:
+  - name: mozilla/distilvit
+    results:
+      - task:
+          type: image-to-text
+          name: Image To Text
+        dataset:
+          name: Mozilla/flickr30k-transformed-captions
+          type: Mozilla/flickr30k-transformed-captions
+        metrics:
+          - name: ROUGE-1
+            type: rouge
+            value: 43.006
+            verified: true
+          - name: ROUGE-2
+            type: rouge
+            value: 16.9939
+            verified: true
+          - name: ROUGE-L
+            type: rouge
+            value: 38.8923
+            verified: true
+          - name: ROUGE-LSUM
+            type: rouge
+            value: 38.8877
+            verified: true
+          - name: loss
+            type: loss
+            value: 0.19939416646957397
+          - name: gen_len
+            type: gen_len
+            value: 11.327256736227712
+            verified: true
+---
+# distilvit
+This model is a work in progress. It is a fine-tuned version of the following base models:
+- a VIT model for the image encoder: https://huggingface.co/google/vit-base-patch16-224-in21k
+- a Distilled GPT-2 model for the text decoder: https://huggingface.co/distilbert/distilgpt2
+This model was trained on:
+- Flickr30k: https://huggingface.co/datasets/nlphuji/flickr30k
+- COCO 2017: https://cocodataset.org
+You can get that checkpoint using the 3083a3cef6e3c8dd90df3f088074bbe836b0f403 commit.
+It was then further fine-tuned on:
+- [Flickr30k debiased](https://huggingface.co/datasets/Mozilla/flickr30k-transformed-captions)
+- [DocOrNot](https://huggingface.co/datasets/Mozilla/docornot)
+- [Alt Text Validation](https://huggingface.co/datasets/Mozilla/alt-text-validation)
+For the latter, the dataset was annotated by our team to correct the alt text generated by the model,
+using the [checkvite tool](https://github.com/mozilla/checkvite).
+You can find the code used to create the model here: https://github.com/mozilla/distilvit

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "C:\\Users\\tarek\\Dev\\distilvit\\distilvit\\..\\vit-base-patch16-224-in21k-distilgpt2",
   "architectures": [
     "VisionEncoderDecoderModel"
   ],
@@ -37,7 +37,7 @@
     "label2id": {
       "LABEL_0": 0
     },
-    "layer_norm_epsilon": 1e-05,
     "length_penalty": 1.0,
     "max_length": 20,
     "min_length": 0,

 {
+  "_name_or_path": "mozilla/distilvit",
   "architectures": [
     "VisionEncoderDecoderModel"
   ],
     "label2id": {
       "LABEL_0": 0
     },
+    "layer_norm_epsilon": 1e-5,
     "length_penalty": 1.0,
     "max_length": 20,
     "min_length": 0,

generation_config.json CHANGED Viewed

@@ -1,17 +1,16 @@
 {
-    "bos_token_id": 50256,
-    "early_stopping": true,
-    "eos_token_id": 50256,
-    "max_length": 50,
-    "no_repeat_ngram_size": 3,
-    "num_beams": 2,
-    "pad_token_id": 50256,
-    "repetition_penalty": 1.3,
-    "seed": 12,
-    "max_time": 5,
-    "transformers_version": "4.33.2",
-    "do_sample": true,
-    "temperature": 0.7,
-    "top_p": 0.9,
-    "top_k": 50
 }

 {
+  "bos_token_id": 50256,
+  "do_sample": true,
+  "early_stopping": true,
+  "eos_token_id": 50256,
+  "max_length": 50,
+  "max_time": 5,
+  "no_repeat_ngram_size": 2,
+  "num_beams": 2,
+  "pad_token_id": 50256,
+  "repetition_penalty": 1.4,
+  "seed": 12,
+  "temperature": 0.8,
+  "top_p": 0.9,
+  "transformers_version": "4.33.2"
 }

onnx/decoder_model.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:74a2588b4e510214a2b4f7b89e5805e6f8d88982d2800d62ba4c3f1589957f73
 size 385864797

 version https://git-lfs.github.com/spec/v1
+oid sha256:5aedbc7bb81581bdd4b90f65db39dc22b3dcdea3923d67542449aa24bd46eaf7
 size 385864797

onnx/decoder_model_merged.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:44d84666cb213e0efdca9607f60890d21d2a6f8e74af659465b7f01ef40abda5
 size 387342586

 version https://git-lfs.github.com/spec/v1
+oid sha256:c398f8e6fd8bf8b03105a7ada1541f488c07870229210a128205c6492b7c38ed
 size 387342586

onnx/decoder_model_merged_quantized.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4b4f88b4343b356236aac05f47319b0f32920ff18c9ac97919dee68cca72dc8c
 size 99759579

 version https://git-lfs.github.com/spec/v1
+oid sha256:0c66eda97aa444b17357dfba1c9827abc7824d6fd905dd28f097f74ddef02943
 size 99759579

onnx/decoder_model_quantized.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49e086aa9b53b659ea8714e1dfd0d5ac9fa74b1a9fec949de911a43511f4fd1d
 size 98065763

 version https://git-lfs.github.com/spec/v1
+oid sha256:8cda4a51db36003185beb5557cc379d0ad773d4efa9329e39793828b74d420f4
 size 98065763

onnx/decoder_with_past_model.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ff3041032448a1af189cd6d017b5bdc82d1260481648b49b173c982ce2d17a9b
 size 385864377

 version https://git-lfs.github.com/spec/v1
+oid sha256:9ab017c9a31793692bb56c0b93be42cb0090d97f6efc9cbd25f092a93451f45e
 size 385864377

onnx/decoder_with_past_model_quantized.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a223f391306a0d9277b954dbad457c16a70227d1b7b8c29ce0523609c3b4468b
 size 98063170

 version https://git-lfs.github.com/spec/v1
+oid sha256:7885df45251f50971c9a8fdf11c79011342988bcabdb0649470751076226b8ae
 size 98063170

onnx/encoder_model.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f96cda5722cdd6be96aeb5f0e1901a0365bb0024936fb1feb71a107b157fda5
 size 343440632

 version https://git-lfs.github.com/spec/v1
+oid sha256:e48effb1e61fcecf4d1587fb15894d15510f366971ff22224719048168a70707
 size 343440632

onnx/encoder_model_quantized.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:930d1c85e497cc090b1c4894bef857cf0944a0c2157d2bc5fe2d421ad290077b
-size 87038170

 version https://git-lfs.github.com/spec/v1
+oid sha256:04622bdece4419687c8a07f4df0e8c1bea1db354fb6decde97bc0bf39f26bab2
+size 87038173

preprocessor_config.json CHANGED Viewed

@@ -7,7 +7,6 @@
     0.5,
     0.5
   ],
-  "image_processor_type": "ViTImageProcessor",
   "feature_extractor_type": "ViTImageProcessor",
   "image_std": [
     0.5,

     0.5,
     0.5
   ],
   "feature_extractor_type": "ViTImageProcessor",
   "image_std": [
     0.5,

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0c482fc553889157006b94f592050aba8b9e45475b42e8497ffa2adad938d3c7
 size 730052378

 version https://git-lfs.github.com/spec/v1
+oid sha256:ea90951d68491925423f56366196ab4d2a7cbafecc88d58623b8f74ae2118872
 size 730052378

quantize_config.json CHANGED Viewed

@@ -4,120 +4,120 @@
     "per_model_config": {
         "decoder_model": {
             "op_types": [
-                "Sub",
-                "Concat",
-                "Squeeze",
-                "Where",
-                "Slice",
                 "Sqrt",
                 "Mul",
                 "Pow",
                 "Div",
-                "MatMul",
                 "Gather",
                 "Softmax",
-                "Transpose",
                 "ConstantOfShape",
-                "Reshape",
-                "Cast",
                 "Tanh",
-                "Unsqueeze",
                 "Add",
-                "Split",
-                "Shape",
-                "Gemm",
-                "Constant",
-                "Range",
-                "ReduceMean"
             ],
             "weight_type": "QInt8"
         },
         "decoder_model_merged": {
             "op_types": [
-                "Sub",
-                "Concat",
-                "Squeeze",
-                "Where",
-                "Slice",
                 "Sqrt",
                 "Mul",
                 "Pow",
                 "Div",
-                "If",
-                "MatMul",
                 "Gather",
                 "Softmax",
-                "Transpose",
                 "ConstantOfShape",
-                "Reshape",
-                "Cast",
                 "Tanh",
-                "Unsqueeze",
                 "Add",
-                "Split",
-                "Shape",
-                "Gemm",
-                "Constant",
-                "Range",
-                "ReduceMean"
             ],
             "weight_type": "QInt8"
         },
         "decoder_with_past_model": {
             "op_types": [
-                "Sub",
-                "Concat",
-                "Squeeze",
-                "Where",
-                "Slice",
                 "Sqrt",
                 "Mul",
                 "Pow",
                 "Div",
-                "MatMul",
                 "Gather",
                 "Softmax",
-                "Transpose",
                 "ConstantOfShape",
-                "Reshape",
-                "Cast",
-                "Tanh",
-                "Unsqueeze",
-                "Add",
-                "Split",
-                "Shape",
                 "Gemm",
-                "Constant",
-                "Range",
-                "ReduceMean"
             ],
             "weight_type": "QInt8"
         },
         "encoder_model": {
             "op_types": [
-                "Sub",
-                "Concat",
-                "Where",
-                "Slice",
                 "Sqrt",
                 "Mul",
                 "Pow",
                 "Div",
-                "MatMul",
                 "Gather",
                 "Erf",
-                "Softmax",
-                "Transpose",
-                "Expand",
-                "ConstantOfShape",
-                "Reshape",
                 "Unsqueeze",
-                "Conv",
-                "Add",
-                "Shape",
                 "Equal",
-                "Constant",
-                "ReduceMean"
             ],
             "weight_type": "QUInt8"
         }

     "per_model_config": {
         "decoder_model": {
             "op_types": [
                 "Sqrt",
+                "MatMul",
+                "Concat",
                 "Mul",
+                "Constant",
+                "Shape",
+                "Range",
                 "Pow",
                 "Div",
+                "Split",
+                "Sub",
+                "Where",
+                "Reshape",
                 "Gather",
+                "Unsqueeze",
+                "Cast",
+                "ReduceMean",
                 "Softmax",
                 "ConstantOfShape",
+                "Slice",
+                "Gemm",
                 "Tanh",
+                "Transpose",
                 "Add",
+                "Squeeze"
             ],
             "weight_type": "QInt8"
         },
         "decoder_model_merged": {
             "op_types": [
                 "Sqrt",
+                "MatMul",
+                "Concat",
                 "Mul",
+                "If",
+                "Constant",
+                "Shape",
+                "Range",
                 "Pow",
                 "Div",
+                "Split",
+                "Sub",
+                "Where",
+                "Reshape",
                 "Gather",
+                "Unsqueeze",
+                "Cast",
+                "ReduceMean",
                 "Softmax",
                 "ConstantOfShape",
+                "Slice",
+                "Gemm",
                 "Tanh",
+                "Transpose",
                 "Add",
+                "Squeeze"
             ],
             "weight_type": "QInt8"
         },
         "decoder_with_past_model": {
             "op_types": [
                 "Sqrt",
+                "MatMul",
+                "Concat",
                 "Mul",
+                "Constant",
+                "Shape",
+                "Range",
                 "Pow",
                 "Div",
+                "Split",
+                "Sub",
+                "Where",
+                "Reshape",
                 "Gather",
+                "Unsqueeze",
+                "Cast",
+                "ReduceMean",
                 "Softmax",
                 "ConstantOfShape",
+                "Slice",
                 "Gemm",
+                "Tanh",
+                "Transpose",
+                "Squeeze",
+                "Add"
             ],
             "weight_type": "QInt8"
         },
         "encoder_model": {
             "op_types": [
                 "Sqrt",
+                "MatMul",
+                "Concat",
                 "Mul",
+                "Constant",
+                "Expand",
+                "Shape",
                 "Pow",
                 "Div",
+                "Conv",
+                "Sub",
+                "Where",
+                "Reshape",
                 "Gather",
                 "Erf",
                 "Unsqueeze",
                 "Equal",
+                "ReduceMean",
+                "Softmax",
+                "ConstantOfShape",
+                "Slice",
+                "Transpose",
+                "Add"
             ],
             "weight_type": "QUInt8"
         }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1a963b881745d00ac683701b379f674fa0dbcd989df12e27bc7247cec0fdee7b
-size 4664

 version https://git-lfs.github.com/spec/v1
+oid sha256:6df8b3132ea43f82797da62dc92bab7492b597315c59b9eeae0937e97904f9e0
+size 4728