pere committed on
Commit dea1939
1 Parent(s): 24292e6
.run_distillation.py.un~ CHANGED
Binary files a/.run_distillation.py.un~ and b/.run_distillation.py.un~ differ
 
generation_config.json CHANGED
@@ -165,7 +165,7 @@
     "<|yue|>": 50358,
     "<|zh|>": 50260
   },
-  "language": "<|en|>",
+  "language": "<|no|>",
   "max_initial_timestamp_index": 1,
   "max_length": 448,
   "no_timestamps_token_id": 50364,
run_distillation.py CHANGED
@@ -1344,16 +1344,23 @@ def main():
                 else raw_datasets[eval_split].select(range(data_args.max_eval_samples))
             )
 
+    # 10.3: filter training data based on WER threshold -> this is KEY to good distillation performance
     def is_wer_in_range(ground_truth, whisper_transcript):
         norm_ground_truth = normalizer(ground_truth)
-        if len(norm_ground_truth) > 0 and whisper_transcript is not None:
+        if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
+            # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
+            return False
+        elif len(norm_ground_truth) == 0 and len(normalizer(whisper_transcript)) == 0:
+            return True
+        elif len(norm_ground_truth.strip()) > 0 and whisper_transcript is not None and len(normalizer(whisper_transcript).strip()) > 0:
             norm_whisper_transcript = normalizer(whisper_transcript)
             wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
             return wer < wer_threshold
         else:
-            # filter automatically since we can't know the WER
+            # filter automatically since we cant know WER
            return False
 
+
     filter_by_wer_threshold = partial(
         raw_datasets["train"].filter,
         function=is_wer_in_range,
@@ -1517,20 +1524,30 @@ def main():
         ]
         wer_ortho = 100 * metric.compute(predictions=spaced_pred_str, references=spaced_label_str)
 
-        # normalize everything and re-compute the WER
-        norm_pred_str = [normalizer(pred) for pred in pred_str]
-        norm_label_str = [normalizer(label) for label in label_str]
-        # for logging, we need the pred/labels to match the norm_pred/norm_labels, so discard any filtered samples here
-        pred_str = [pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
-        label_str = [label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
-        # filtering step to only evaluate the samples that correspond to non-zero normalized references:
-        norm_pred_str = [norm_pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
-        norm_label_str = [norm_label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
+        # Iterate through all predictions and labels
+        for pred, label in zip(pred_str, label_str):
+            # Normalize the prediction and label
+            normalized_pred = normalizer(pred)
+            normalized_label = normalizer(label)
 
-        wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
+            # If either normalized string is empty after normalization, replace with "<|nospeech|>"
+            if not normalized_pred.strip():
+                normalized_pred = "<|nospeech|>"
+            if not normalized_label.strip():
+                normalized_label = "<|nospeech|>"
+
+            norm_pred_str.append(normalized_pred)
+            norm_label_str.append(normalized_label)
 
+        # Replace original strings with "<|nocaptions|>" where necessary for consistency
+        pred_str = [pred if len(pred.strip()) > 0 else "<|nospeech|>" for pred in pred_str]
+        label_str = [label if len(label.strip()) > 0 else "<|nospeech|>" for label in label_str]
+
+        # Compute WER using all entries, including those with "<|nocaptions|>"
+        wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
         return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
 
+
     # 9. Save feature extractor, tokenizer, config and generation config
     feature_extractor.save_pretrained(training_args.output_dir)
     tokenizer.save_pretrained(training_args.output_dir)
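The first hunk tightens the pseudo-label filter (step 10.3): all-caps transcripts are rejected outright, samples where both the reference and the pseudo-label normalize to empty strings are kept, and WER is only computed when both sides are non-empty. Below is a standalone sketch of that decision logic, not the script itself: a toy normalizer and an assumed wer_threshold stand in for the objects built earlier in main(), and a None guard is added up front so the sketch runs on any input.

```python
# Standalone sketch of the updated pseudo-label filter (assumed normalizer and threshold).
import evaluate

metric = evaluate.load("wer")
wer_threshold = 10.0  # assumed value; the script takes this from its arguments


def normalizer(text):
    # toy stand-in for the Whisper text normalizer used in run_distillation.py
    return " ".join(text.lower().split())


def is_wer_in_range(ground_truth, whisper_transcript):
    if whisper_transcript is None:
        return False  # no pseudo-label, nothing to compare against
    if whisper_transcript.upper() == whisper_transcript:
        return False  # all-caps pseudo-labels are treated as erroneous large-v3 generations
    norm_ground_truth = normalizer(ground_truth)
    norm_whisper_transcript = normalizer(whisper_transcript)
    if len(norm_ground_truth) == 0 and len(norm_whisper_transcript) == 0:
        return True   # both empty after normalization: keep the (silent) sample
    if len(norm_ground_truth.strip()) > 0 and len(norm_whisper_transcript.strip()) > 0:
        wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
        return wer < wer_threshold
    return False      # one side empty: WER is undefined, drop the sample


print(is_wer_in_range("god morgen Norge", "God morgen Norge"))   # True: identical after normalization
print(is_wer_in_range("god morgen Norge", "GOD MORGEN NORGE"))   # False: all-caps transcript rejected
```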
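The second hunk changes how evaluation WER is computed: instead of discarding samples whose normalized reference is empty, both sides are kept and empty normalized strings are replaced by a placeholder token, so no-speech segments still count towards the metric. A small sketch of that behaviour with assumed toy data and the same toy normalizer:

```python
# Sketch of the new WER computation (assumed toy data and toy normalizer).
import evaluate

metric = evaluate.load("wer")


def normalizer(text):
    return " ".join(text.lower().split())


pred_str = ["Hei verden", ""]   # second prediction is empty (predicted silence)
label_str = ["hei verden", ""]  # second reference is empty too

norm_pred_str, norm_label_str = [], []
for pred, label in zip(pred_str, label_str):
    normalized_pred = normalizer(pred)
    normalized_label = normalizer(label)
    # previously these pairs were dropped; now they become a placeholder pair
    norm_pred_str.append(normalized_pred if normalized_pred.strip() else "<|nospeech|>")
    norm_label_str.append(normalized_label if normalized_label.strip() else "<|nospeech|>")

wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
print(wer)  # 0.0: both samples match, including the placeholder pair
```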
tokenizer.json CHANGED
@@ -14503,7 +14503,7 @@
       },
       {
         "SpecialToken": {
-          "id": "<|en|>",
+          "id": "<|no|>",
           "type_id": 0
         }
       },
@@ -14541,7 +14541,7 @@
       },
       {
         "SpecialToken": {
-          "id": "<|en|>",
+          "id": "<|no|>",
           "type_id": 0
         }
       },
@@ -14586,22 +14586,22 @@
           "<|endoftext|>"
         ]
       },
-      "<|en|>": {
-        "id": "<|en|>",
+      "<|notimestamps|>": {
+        "id": "<|notimestamps|>",
         "ids": [
-          50259
+          50364
         ],
         "tokens": [
-          "<|en|>"
+          "<|notimestamps|>"
         ]
       },
-      "<|notimestamps|>": {
-        "id": "<|notimestamps|>",
+      "<|no|>": {
+        "id": "<|no|>",
         "ids": [
-          50364
+          50288
         ],
         "tokens": [
-          "<|notimestamps|>"
+          "<|no|>"
         ]
       },
       "<|startoftranscript|>": {