Spaces:

jhj0517
/

Whisper-WebUI

Running

App Files Files Community

jhj0517 committed on Sep 21, 2023

Commit

1c64e54

•

2 Parent(s): 58c7e65 e29f6b4

Merge pull request #43 from jhj0517/add-advanced-params

Browse files

Files changed (3) hide show

app.py +21 -3
modules/faster_whisper_inference.py +72 -6
modules/whisper_Inference.py +71 -5

app.py CHANGED Viewed

@@ -54,14 +54,20 @@ class App:
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                         tb_indicator = gr.Textbox(label="Output", scale=8)
                         btn_openfolder = gr.Button('📂', scale=2)
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
-                                  inputs=[input_file, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp],
                                   outputs=[tb_indicator])
                     btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
                     dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
@@ -86,14 +92,20 @@ class App:
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                                    interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                         tb_indicator = gr.Textbox(label="Output", scale=8)
                         btn_openfolder = gr.Button('📂', scale=2)
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
-                                  inputs=[tb_youtubelink, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp],
                                   outputs=[tb_indicator])
                     tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
                                           outputs=[img_thumbnail, tb_title, tb_description])
@@ -111,14 +123,20 @@ class App:
                         dd_subformat = gr.Dropdown(["SRT", "WebVTT"], value="SRT", label="Subtitle Format")
                     with gr.Row():
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                         tb_indicator = gr.Textbox(label="Output", scale=8)
                         btn_openfolder = gr.Button('📂', scale=2)
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
-                                  inputs=[mic_input, dd_model, dd_lang, dd_subformat, cb_translate],
                                   outputs=[tb_indicator])
                     btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
                     dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])

                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
+                    with gr.Accordion("Advanced_Parameters", open=False):
+                        nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
+                        nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
+                        nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                         tb_indicator = gr.Textbox(label="Output", scale=8)
                         btn_openfolder = gr.Button('📂', scale=2)
+                    params = [input_file, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp]
+                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
+                                  inputs=params + advanced_params,
                                   outputs=[tb_indicator])
                     btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
                     dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                                    interactive=True)
+                    with gr.Accordion("Advanced_Parameters", open=False):
+                        nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
+                        nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
+                        nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                         tb_indicator = gr.Textbox(label="Output", scale=8)
                         btn_openfolder = gr.Button('📂', scale=2)
+                    params = [tb_youtubelink, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp]
+                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
+                                  inputs=params + advanced_params,
                                   outputs=[tb_indicator])
                     tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
                                           outputs=[img_thumbnail, tb_title, tb_description])
                         dd_subformat = gr.Dropdown(["SRT", "WebVTT"], value="SRT", label="Subtitle Format")
                     with gr.Row():
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
+                    with gr.Accordion("Advanced_Parameters", open=False):
+                        nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
+                        nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
+                        nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                         tb_indicator = gr.Textbox(label="Output", scale=8)
                         btn_openfolder = gr.Button('📂', scale=2)
+                    params = [mic_input, dd_model, dd_lang, dd_subformat, cb_translate]
+                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
+                                  inputs=params + advanced_params,
                                   outputs=[tb_indicator])
                     btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
                     dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])

modules/faster_whisper_inference.py CHANGED Viewed

@@ -34,6 +34,9 @@ class FasterWhisperInference(BaseInterface):
                         subformat: str,
                         istranslate: bool,
                         add_timestamp: bool,
                         progress=gr.Progress()
                         ) -> str:
         """
@@ -54,6 +57,15 @@ class FasterWhisperInference(BaseInterface):
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
@@ -74,6 +86,9 @@ class FasterWhisperInference(BaseInterface):
                     audio=fileobj.name,
                     lang=lang,
                     istranslate=istranslate,
                     progress=progress
                 )
@@ -110,6 +125,9 @@ class FasterWhisperInference(BaseInterface):
                            subformat: str,
                            istranslate: bool,
                            add_timestamp: bool,
                            progress=gr.Progress()
                            ) -> str:
         """
@@ -130,6 +148,15 @@ class FasterWhisperInference(BaseInterface):
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
@@ -152,6 +179,9 @@ class FasterWhisperInference(BaseInterface):
                 audio=audio,
                 lang=lang,
                 istranslate=istranslate,
                 progress=progress
             )
@@ -168,10 +198,17 @@ class FasterWhisperInference(BaseInterface):
         except Exception as e:
             return f"Error: {str(e)}"
         finally:
-            yt = get_ytdata(youtubelink)
-            file_path = get_ytaudio(yt)
-            self.release_cuda_memory()
-            self.remove_input_files([file_path])
     def transcribe_mic(self,
                        micaudio: str,
@@ -179,6 +216,9 @@ class FasterWhisperInference(BaseInterface):
                        lang: str,
                        subformat: str,
                        istranslate: bool,
                        progress=gr.Progress()
                        ) -> str:
         """
@@ -197,6 +237,15 @@ class FasterWhisperInference(BaseInterface):
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
@@ -217,6 +266,9 @@ class FasterWhisperInference(BaseInterface):
                 audio=micaudio,
                 lang=lang,
                 istranslate=istranslate,
                 progress=progress
             )
             progress(1, desc="Completed!")
@@ -238,6 +290,9 @@ class FasterWhisperInference(BaseInterface):
                    audio: Union[str, BinaryIO, np.ndarray],
                    lang: str,
                    istranslate: bool,
                    progress: gr.Progress
                    ) -> Tuple[list, float]:
         """
@@ -252,6 +307,15 @@ class FasterWhisperInference(BaseInterface):
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
@@ -269,8 +333,10 @@ class FasterWhisperInference(BaseInterface):
         segments, info = self.model.transcribe(
             audio=audio,
             language=lang,
-            beam_size=self.default_beam_size,
-            task="translate" if istranslate and self.current_model_size in self.translatable_models else "transcribe"
         )
         progress(0, desc="Loading audio..")

                         subformat: str,
                         istranslate: bool,
                         add_timestamp: bool,
+                        beam_size: int,
+                        log_prob_threshold: float,
+                        no_speech_threshold: float,
                         progress=gr.Progress()
                         ) -> str:
         """
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+        beam_size: int
+            Int value from gr.Number() that is used for decoding option.
+        log_prob_threshold: float
+            float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
                     audio=fileobj.name,
                     lang=lang,
                     istranslate=istranslate,
+                    beam_size=beam_size,
+                    log_prob_threshold=log_prob_threshold,
+                    no_speech_threshold=no_speech_threshold,
                     progress=progress
                 )
                            subformat: str,
                            istranslate: bool,
                            add_timestamp: bool,
+                           beam_size: int,
+                           log_prob_threshold: float,
+                           no_speech_threshold: float,
                            progress=gr.Progress()
                            ) -> str:
         """
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+        beam_size: int
+            Int value from gr.Number() that is used for decoding option.
+        log_prob_threshold: float
+            float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
                 audio=audio,
                 lang=lang,
                 istranslate=istranslate,
+                beam_size=beam_size,
+                log_prob_threshold=log_prob_threshold,
+                no_speech_threshold=no_speech_threshold,
                 progress=progress
             )
         except Exception as e:
             return f"Error: {str(e)}"
         finally:
+            try:
+                if 'yt' not in locals():
+                    yt = get_ytdata(youtubelink)
+                    file_path = get_ytaudio(yt)
+                else:
+                    file_path = get_ytaudio(yt)
+                self.release_cuda_memory()
+                self.remove_input_files([file_path])
+            except Exception as cleanup_error:
+                pass
     def transcribe_mic(self,
                        micaudio: str,
                        lang: str,
                        subformat: str,
                        istranslate: bool,
+                       beam_size: int,
+                       log_prob_threshold: float,
+                       no_speech_threshold: float,
                        progress=gr.Progress()
                        ) -> str:
         """
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
+        beam_size: int
+            Int value from gr.Number() that is used for decoding option.
+        log_prob_threshold: float
+            float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
                 audio=micaudio,
                 lang=lang,
                 istranslate=istranslate,
+                beam_size=beam_size,
+                log_prob_threshold=log_prob_threshold,
+                no_speech_threshold=no_speech_threshold,
                 progress=progress
             )
             progress(1, desc="Completed!")
                    audio: Union[str, BinaryIO, np.ndarray],
                    lang: str,
                    istranslate: bool,
+                   beam_size: int,
+                   log_prob_threshold: float,
+                   no_speech_threshold: float,
                    progress: gr.Progress
                    ) -> Tuple[list, float]:
         """
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
+        beam_size: int
+            Int value from gr.Number() that is used for decoding option.
+        log_prob_threshold: float
+            float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         segments, info = self.model.transcribe(
             audio=audio,
             language=lang,
+            task="translate" if istranslate and self.current_model_size in self.translatable_models else "transcribe",
+            beam_size=beam_size,
+            log_prob_threshold=log_prob_threshold,
+            no_speech_threshold=no_speech_threshold,
         )
         progress(0, desc="Loading audio..")

modules/whisper_Inference.py CHANGED Viewed

@@ -30,6 +30,9 @@ class WhisperInference(BaseInterface):
                         subformat: str,
                         istranslate: bool,
                         add_timestamp: bool,
                         progress=gr.Progress()):
         """
         Write subtitle file from Files
@@ -49,6 +52,15 @@ class WhisperInference(BaseInterface):
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
@@ -66,6 +78,9 @@ class WhisperInference(BaseInterface):
                 result, elapsed_time = self.transcribe(audio=audio,
                                                        lang=lang,
                                                        istranslate=istranslate,
                                                        progress=progress)
                 progress(1, desc="Completed!")
@@ -103,6 +118,9 @@ class WhisperInference(BaseInterface):
                            subformat: str,
                            istranslate: bool,
                            add_timestamp: bool,
                            progress=gr.Progress()):
         """
         Write subtitle file from Youtube
@@ -122,6 +140,15 @@ class WhisperInference(BaseInterface):
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
@@ -137,6 +164,9 @@ class WhisperInference(BaseInterface):
             result, elapsed_time = self.transcribe(audio=audio,
                                                    lang=lang,
                                                    istranslate=istranslate,
                                                    progress=progress)
             progress(1, desc="Completed!")
@@ -153,10 +183,17 @@ class WhisperInference(BaseInterface):
             print(f"Error transcribing youtube video: {str(e)}")
             return f"Error transcribing youtube video: {str(e)}"
         finally:
-            yt = get_ytdata(youtubelink)
-            file_path = get_ytaudio(yt)
-            self.release_cuda_memory()
-            self.remove_input_files([file_path])
     def transcribe_mic(self,
                        micaudio: str,
@@ -164,6 +201,9 @@ class WhisperInference(BaseInterface):
                        lang: str,
                        subformat: str,
                        istranslate: bool,
                        progress=gr.Progress()):
         """
         Write subtitle file from microphone
@@ -181,6 +221,15 @@ class WhisperInference(BaseInterface):
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
@@ -193,6 +242,9 @@ class WhisperInference(BaseInterface):
             result, elapsed_time = self.transcribe(audio=micaudio,
                                                    lang=lang,
                                                    istranslate=istranslate,
                                                    progress=progress)
             progress(1, desc="Completed!")
@@ -215,6 +267,9 @@ class WhisperInference(BaseInterface):
                    audio: Union[str, np.ndarray, torch.Tensor],
                    lang: str,
                    istranslate: bool,
                    progress: gr.Progress
                    ) -> Tuple[list[dict], float]:
         """
@@ -229,6 +284,15 @@ class WhisperInference(BaseInterface):
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
@@ -251,7 +315,9 @@ class WhisperInference(BaseInterface):
         segments_result = self.model.transcribe(audio=audio,
                                                 language=lang,
                                                 verbose=False,
-                                                beam_size=self.default_beam_size,
                                                 task="translate" if istranslate and self.current_model_size in translatable_model else "transcribe",
                                                 progress_callback=progress_callback)["segments"]
         elapsed_time = time.time() - start_time

                         subformat: str,
                         istranslate: bool,
                         add_timestamp: bool,
+                        beam_size: int,
+                        log_prob_threshold: float,
+                        no_speech_threshold: float,
                         progress=gr.Progress()):
         """
         Write subtitle file from Files
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+        beam_size: int
+            Int value from gr.Number() that is used for decoding option.
+        log_prob_threshold: float
+            float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
                 result, elapsed_time = self.transcribe(audio=audio,
                                                        lang=lang,
                                                        istranslate=istranslate,
+                                                       beam_size=beam_size,
+                                                       log_prob_threshold=log_prob_threshold,
+                                                       no_speech_threshold=no_speech_threshold,
                                                        progress=progress)
                 progress(1, desc="Completed!")
                            subformat: str,
                            istranslate: bool,
                            add_timestamp: bool,
+                           beam_size: int,
+                           log_prob_threshold: float,
+                           no_speech_threshold: float,
                            progress=gr.Progress()):
         """
         Write subtitle file from Youtube
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+        beam_size: int
+            Int value from gr.Number() that is used for decoding option.
+        log_prob_threshold: float
+            float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
             result, elapsed_time = self.transcribe(audio=audio,
                                                    lang=lang,
                                                    istranslate=istranslate,
+                                                   beam_size=beam_size,
+                                                   log_prob_threshold=log_prob_threshold,
+                                                   no_speech_threshold=no_speech_threshold,
                                                    progress=progress)
             progress(1, desc="Completed!")
             print(f"Error transcribing youtube video: {str(e)}")
             return f"Error transcribing youtube video: {str(e)}"
         finally:
+            try:
+                if 'yt' not in locals():
+                    yt = get_ytdata(youtubelink)
+                    file_path = get_ytaudio(yt)
+                else:
+                    file_path = get_ytaudio(yt)
+                self.release_cuda_memory()
+                self.remove_input_files([file_path])
+            except Exception as cleanup_error:
+                pass
     def transcribe_mic(self,
                        micaudio: str,
                        lang: str,
                        subformat: str,
                        istranslate: bool,
+                       beam_size: int,
+                       log_prob_threshold: float,
+                       no_speech_threshold: float,
                        progress=gr.Progress()):
         """
         Write subtitle file from microphone
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
+        beam_size: int
+            Int value from gr.Number() that is used for decoding option.
+        log_prob_threshold: float
+            float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
             result, elapsed_time = self.transcribe(audio=micaudio,
                                                    lang=lang,
                                                    istranslate=istranslate,
+                                                   beam_size=beam_size,
+                                                   log_prob_threshold=log_prob_threshold,
+                                                   no_speech_threshold=no_speech_threshold,
                                                    progress=progress)
             progress(1, desc="Completed!")
                    audio: Union[str, np.ndarray, torch.Tensor],
                    lang: str,
                    istranslate: bool,
+                   beam_size: int,
+                   log_prob_threshold: float,
+                   no_speech_threshold: float,
                    progress: gr.Progress
                    ) -> Tuple[list[dict], float]:
         """
         istranslate: bool
             Boolean value from gr.Checkbox() that determines whether to translate to English.
             It's Whisper's feature to translate speech from another language directly into English end-to-end.
+        beam_size: int
+            Int value from gr.Number() that is used for decoding option.
+        log_prob_threshold: float
+            float value from gr.Number(). If the average log probability over sampled tokens is
+            below this value, treat as failed.
+        no_speech_threshold: float
+            float value from gr.Number(). If the no_speech probability is higher than this value AND
+            the average log probability over sampled tokens is below `log_prob_threshold`,
+            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         segments_result = self.model.transcribe(audio=audio,
                                                 language=lang,
                                                 verbose=False,
+                                                beam_size=beam_size,
+                                                logprob_threshold=log_prob_threshold,
+                                                no_speech_threshold=no_speech_threshold,
                                                 task="translate" if istranslate and self.current_model_size in translatable_model else "transcribe",
                                                 progress_callback=progress_callback)["segments"]
         elapsed_time = time.time() - start_time