Commit e29f6b4 by jhj0517 (parent: 58c7e65)

add advanced parameter tab

app.py CHANGED
@@ -54,14 +54,20 @@ class App:
     cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
 with gr.Row():
     cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
+with gr.Accordion("Advanced_Parameters", open=False):
+    nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
+    nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
+    nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 with gr.Row():
     btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
 with gr.Row():
     tb_indicator = gr.Textbox(label="Output", scale=8)
     btn_openfolder = gr.Button('📂', scale=2)

+params = [input_file, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp]
+advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
 btn_run.click(fn=self.whisper_inf.transcribe_file,
-              inputs=[input_file, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp],
+              inputs=params + advanced_params,
               outputs=[tb_indicator])
 btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
@@ -86,14 +92,20 @@ class App:
 with gr.Row():
     cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                interactive=True)
+with gr.Accordion("Advanced_Parameters", open=False):
+    nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
+    nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
+    nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 with gr.Row():
     btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
 with gr.Row():
     tb_indicator = gr.Textbox(label="Output", scale=8)
     btn_openfolder = gr.Button('📂', scale=2)

+params = [tb_youtubelink, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp]
+advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
 btn_run.click(fn=self.whisper_inf.transcribe_youtube,
-              inputs=[tb_youtubelink, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp],
+              inputs=params + advanced_params,
               outputs=[tb_indicator])
 tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
                       outputs=[img_thumbnail, tb_title, tb_description])
@@ -111,14 +123,20 @@ class App:
     dd_subformat = gr.Dropdown(["SRT", "WebVTT"], value="SRT", label="Subtitle Format")
 with gr.Row():
     cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
+with gr.Accordion("Advanced_Parameters", open=False):
+    nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
+    nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
+    nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 with gr.Row():
     btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
 with gr.Row():
     tb_indicator = gr.Textbox(label="Output", scale=8)
     btn_openfolder = gr.Button('📂', scale=2)

+params = [mic_input, dd_model, dd_lang, dd_subformat, cb_translate]
+advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
 btn_run.click(fn=self.whisper_inf.transcribe_mic,
-              inputs=[mic_input, dd_model, dd_lang, dd_subformat, cb_translate],
+              inputs=params + advanced_params,
               outputs=[tb_indicator])
 btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
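
All three tabs use the same wiring: gradio's click() passes the current widget values to the handler positionally, in the order of the `inputs` list, so `params + advanced_params` has to line up one-to-one with the `beam_size`, `log_prob_threshold`, and `no_speech_threshold` parameters appended to each transcribe method below. A minimal self-contained sketch of that pattern (the stub handler and reduced widget set are illustrative, not the repo's code):

import gradio as gr

# Illustrative stub: gradio binds the `inputs` list to these parameters by
# position, so the signature must mirror params + advanced_params exactly.
def transcribe_stub(audio, translate, beam_size, log_prob_threshold, no_speech_threshold):
    return f"beam={beam_size}, log_prob={log_prob_threshold}, no_speech={no_speech_threshold}"

with gr.Blocks() as demo:
    audio_in = gr.Audio(type="filepath")
    cb_translate = gr.Checkbox(value=False, label="Translate to English?")
    with gr.Accordion("Advanced_Parameters", open=False):
        nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0)
        nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0)
        nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6)
    btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
    tb_indicator = gr.Textbox(label="Output")

    params = [audio_in, cb_translate]
    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
    btn_run.click(fn=transcribe_stub, inputs=params + advanced_params, outputs=[tb_indicator])

demo.launch()

Collecting the widgets into `params` and `advanced_params` lists also keeps the three click() calls from drifting apart as more options are added.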
modules/faster_whisper_inference.py CHANGED
@@ -34,6 +34,9 @@ class FasterWhisperInference(BaseInterface):
                         subformat: str,
                         istranslate: bool,
                         add_timestamp: bool,
+                        beam_size: int,
+                        log_prob_threshold: float,
+                        no_speech_threshold: float,
                         progress=gr.Progress()
                         ) -> str:
         """
@@ -54,6 +57,15 @@ class FasterWhisperInference(BaseInterface):
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+        beam_size: int
+            Int value from gr.Number() that is used as a decoding option.
+        log_prob_threshold: float
+            Float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            Float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.

@@ -74,6 +86,9 @@ class FasterWhisperInference(BaseInterface):
                 audio=fileobj.name,
                 lang=lang,
                 istranslate=istranslate,
+                beam_size=beam_size,
+                log_prob_threshold=log_prob_threshold,
+                no_speech_threshold=no_speech_threshold,
                 progress=progress
             )

@@ -110,6 +125,9 @@ class FasterWhisperInference(BaseInterface):
                            subformat: str,
                            istranslate: bool,
                            add_timestamp: bool,
+                           beam_size: int,
+                           log_prob_threshold: float,
+                           no_speech_threshold: float,
                            progress=gr.Progress()
                            ) -> str:
         """
@@ -130,6 +148,15 @@ class FasterWhisperInference(BaseInterface):
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+        beam_size: int
+            Int value from gr.Number() that is used as a decoding option.
+        log_prob_threshold: float
+            Float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            Float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.

@@ -152,6 +179,9 @@ class FasterWhisperInference(BaseInterface):
                 audio=audio,
                 lang=lang,
                 istranslate=istranslate,
+                beam_size=beam_size,
+                log_prob_threshold=log_prob_threshold,
+                no_speech_threshold=no_speech_threshold,
                 progress=progress
             )

@@ -168,10 +198,17 @@ class FasterWhisperInference(BaseInterface):
         except Exception as e:
             return f"Error: {str(e)}"
         finally:
-            yt = get_ytdata(youtubelink)
-            file_path = get_ytaudio(yt)
-            self.release_cuda_memory()
-            self.remove_input_files([file_path])
+            try:
+                if 'yt' not in locals():
+                    yt = get_ytdata(youtubelink)
+                    file_path = get_ytaudio(yt)
+                else:
+                    file_path = get_ytaudio(yt)
+
+                self.release_cuda_memory()
+                self.remove_input_files([file_path])
+            except Exception as cleanup_error:
+                pass

     def transcribe_mic(self,
                        micaudio: str,
@@ -179,6 +216,9 @@ class FasterWhisperInference(BaseInterface):
                        lang: str,
                        subformat: str,
                        istranslate: bool,
+                       beam_size: int,
+                       log_prob_threshold: float,
+                       no_speech_threshold: float,
                        progress=gr.Progress()
                        ) -> str:
         """
@@ -197,6 +237,15 @@ class FasterWhisperInference(BaseInterface):
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
+        beam_size: int
+            Int value from gr.Number() that is used as a decoding option.
+        log_prob_threshold: float
+            Float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            Float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.

@@ -217,6 +266,9 @@ class FasterWhisperInference(BaseInterface):
                 audio=micaudio,
                 lang=lang,
                 istranslate=istranslate,
+                beam_size=beam_size,
+                log_prob_threshold=log_prob_threshold,
+                no_speech_threshold=no_speech_threshold,
                 progress=progress
             )
         progress(1, desc="Completed!")
@@ -238,6 +290,9 @@ class FasterWhisperInference(BaseInterface):
                    audio: Union[str, BinaryIO, np.ndarray],
                    lang: str,
                    istranslate: bool,
+                   beam_size: int,
+                   log_prob_threshold: float,
+                   no_speech_threshold: float,
                    progress: gr.Progress
                    ) -> Tuple[list, float]:
         """
@@ -252,6 +307,15 @@ class FasterWhisperInference(BaseInterface):
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
+        beam_size: int
+            Int value from gr.Number() that is used as a decoding option.
+        log_prob_threshold: float
+            Float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            Float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.

@@ -269,8 +333,10 @@ class FasterWhisperInference(BaseInterface):
         segments, info = self.model.transcribe(
             audio=audio,
             language=lang,
-            beam_size=self.default_beam_size,
-            task="translate" if istranslate and self.current_model_size in self.translatable_models else "transcribe"
+            task="translate" if istranslate and self.current_model_size in self.translatable_models else "transcribe",
+            beam_size=beam_size,
+            log_prob_threshold=log_prob_threshold,
+            no_speech_threshold=no_speech_threshold,
         )
         progress(0, desc="Loading audio..")

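All three options are forwarded unchanged to faster-whisper: WhisperModel.transcribe() accepts keyword arguments with exactly these names. A minimal standalone sketch of the underlying call (the model size, compute type, and audio path are placeholders):

from faster_whisper import WhisperModel

# Standalone sketch of the call the patched transcribe() now builds.
model = WhisperModel("base", device="auto", compute_type="int8")
segments, info = model.transcribe(
    audio="audio.mp3",
    language="en",
    task="transcribe",
    beam_size=1,               # wider beams trade speed for accuracy
    log_prob_threshold=-1.0,   # mean token log-prob below this treats the decode as failed
    no_speech_threshold=0.6,   # combined with the log-prob test to mark a segment silent
)
for segment in segments:       # segments is a lazy generator; iterating runs the decode
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")

The lazy generator is also why the patched code can call progress(0, desc="Loading audio..") after model.transcribe() returns: no decoding has actually happened at that point.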
modules/whisper_Inference.py CHANGED
@@ -30,6 +30,9 @@ class WhisperInference(BaseInterface):
                         subformat: str,
                         istranslate: bool,
                         add_timestamp: bool,
+                        beam_size: int,
+                        log_prob_threshold: float,
+                        no_speech_threshold: float,
                         progress=gr.Progress()):
         """
         Write subtitle file from Files
@@ -49,6 +52,15 @@ class WhisperInference(BaseInterface):
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+        beam_size: int
+            Int value from gr.Number() that is used as a decoding option.
+        log_prob_threshold: float
+            Float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            Float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
@@ -66,6 +78,9 @@ class WhisperInference(BaseInterface):
         result, elapsed_time = self.transcribe(audio=audio,
                                                lang=lang,
                                                istranslate=istranslate,
+                                               beam_size=beam_size,
+                                               log_prob_threshold=log_prob_threshold,
+                                               no_speech_threshold=no_speech_threshold,
                                                progress=progress)
         progress(1, desc="Completed!")

@@ -103,6 +118,9 @@ class WhisperInference(BaseInterface):
                            subformat: str,
                            istranslate: bool,
                            add_timestamp: bool,
+                           beam_size: int,
+                           log_prob_threshold: float,
+                           no_speech_threshold: float,
                            progress=gr.Progress()):
         """
         Write subtitle file from Youtube
@@ -122,6 +140,15 @@ class WhisperInference(BaseInterface):
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+        beam_size: int
+            Int value from gr.Number() that is used as a decoding option.
+        log_prob_threshold: float
+            Float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            Float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
@@ -137,6 +164,9 @@ class WhisperInference(BaseInterface):
         result, elapsed_time = self.transcribe(audio=audio,
                                                lang=lang,
                                                istranslate=istranslate,
+                                               beam_size=beam_size,
+                                               log_prob_threshold=log_prob_threshold,
+                                               no_speech_threshold=no_speech_threshold,
                                                progress=progress)
         progress(1, desc="Completed!")

@@ -153,10 +183,17 @@ class WhisperInference(BaseInterface):
             print(f"Error transcribing youtube video: {str(e)}")
             return f"Error transcribing youtube video: {str(e)}"
         finally:
-            yt = get_ytdata(youtubelink)
-            file_path = get_ytaudio(yt)
-            self.release_cuda_memory()
-            self.remove_input_files([file_path])
+            try:
+                if 'yt' not in locals():
+                    yt = get_ytdata(youtubelink)
+                    file_path = get_ytaudio(yt)
+                else:
+                    file_path = get_ytaudio(yt)
+
+                self.release_cuda_memory()
+                self.remove_input_files([file_path])
+            except Exception as cleanup_error:
+                pass

     def transcribe_mic(self,
                        micaudio: str,
@@ -164,6 +201,9 @@ class WhisperInference(BaseInterface):
                        lang: str,
                        subformat: str,
                        istranslate: bool,
+                       beam_size: int,
+                       log_prob_threshold: float,
+                       no_speech_threshold: float,
                        progress=gr.Progress()):
         """
         Write subtitle file from microphone
@@ -181,6 +221,15 @@ class WhisperInference(BaseInterface):
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
+        beam_size: int
+            Int value from gr.Number() that is used as a decoding option.
+        log_prob_threshold: float
+            Float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            Float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
@@ -193,6 +242,9 @@ class WhisperInference(BaseInterface):
         result, elapsed_time = self.transcribe(audio=micaudio,
                                                lang=lang,
                                                istranslate=istranslate,
+                                               beam_size=beam_size,
+                                               log_prob_threshold=log_prob_threshold,
+                                               no_speech_threshold=no_speech_threshold,
                                                progress=progress)
         progress(1, desc="Completed!")

@@ -215,6 +267,9 @@ class WhisperInference(BaseInterface):
                    audio: Union[str, np.ndarray, torch.Tensor],
                    lang: str,
                    istranslate: bool,
+                   beam_size: int,
+                   log_prob_threshold: float,
+                   no_speech_threshold: float,
                    progress: gr.Progress
                    ) -> Tuple[list[dict], float]:
         """
@@ -229,6 +284,15 @@ class WhisperInference(BaseInterface):
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
+        beam_size: int
+            Int value from gr.Number() that is used as a decoding option.
+        log_prob_threshold: float
+            Float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            Float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.

@@ -251,7 +315,9 @@ class WhisperInference(BaseInterface):
         segments_result = self.model.transcribe(audio=audio,
                                                 language=lang,
                                                 verbose=False,
-                                                beam_size=self.default_beam_size,
+                                                beam_size=beam_size,
+                                                logprob_threshold=log_prob_threshold,
+                                                no_speech_threshold=no_speech_threshold,
                                                 task="translate" if istranslate and self.current_model_size in translatable_model else "transcribe",
                                                 progress_callback=progress_callback)["segments"]
         elapsed_time = time.time() - start_time
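
Note the naming mismatch the last hunk absorbs: openai-whisper spells the option logprob_threshold, while the UI variable (and faster-whisper) use log_prob_threshold, hence the logprob_threshold=log_prob_threshold mapping. The repo routes this call through a forked whisper that adds progress_callback; against stock openai-whisper the equivalent call would look roughly like this (placeholder audio path, no progress callback):

import whisper

# Hedged sketch against stock openai-whisper; the repo itself uses a fork
# that adds a progress_callback keyword.
model = whisper.load_model("base")
result = model.transcribe(
    audio="audio.mp3",
    language="en",
    verbose=False,
    task="transcribe",
    beam_size=1,              # forwarded to DecodingOptions via **decode_options
    logprob_threshold=-1.0,   # note: no underscore between "log" and "prob" here
    no_speech_threshold=0.6,
)
for segment in result["segments"]:
    print(f"[{segment['start']:.2f} -> {segment['end']:.2f}] {segment['text']}")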