|
from dataclasses import dataclass, field |
|
from typing import Optional |
|
|
|
|
|
@dataclass
class WhisperSTTHandlerArguments:
    """Configuration for the Whisper speech-to-text handler.

    Each field carries an argparse-style ``help`` string in its metadata,
    intended to be consumed by an HfArgumentParser-like CLI layer.
    """

    # Pretrained Whisper checkpoint identifier (hub repo or local path).
    stt_model_name: str = field(
        default="distil-whisper/distil-large-v3",
        metadata={
            "help": "The pretrained Whisper model to use. Default is 'distil-whisper/distil-large-v3'."
        },
    )
    # Torch device string on which the model runs (e.g. 'cuda', 'cpu').
    stt_device: str = field(
        default="cuda",
        metadata={
            "help": "The device type on which the model will run. Default is 'cuda' for GPU acceleration."
        },
    )
    # Dtype name, resolved to a torch dtype by the handler.
    stt_torch_dtype: str = field(
        default="float16",
        metadata={
            "help": "The PyTorch data type for the model and input tensors. One of `float32` (full-precision), `float16` or `bfloat16` (both half-precision)."
        },
    )
    # torch.compile mode; None disables compilation.
    # NOTE: annotation fixed from `str` to `Optional[str]` — the default is None.
    stt_compile_mode: Optional[str] = field(
        default=None,
        metadata={
            "help": "Compile mode for torch compile. Either 'default', 'reduce-overhead' and 'max-autotune'. Default is None (no compilation)"
        },
    )
    # Generation cap passed through to model.generate().
    stt_gen_max_new_tokens: int = field(
        default=128,
        metadata={
            "help": "The maximum number of new tokens to generate. Default is 128."
        },
    )
    # Beam count for decoding; 1 means greedy search.
    stt_gen_num_beams: int = field(
        default=1,
        metadata={
            "help": "The number of beams for beam search. Default is 1, implying greedy decoding."
        },
    )
    # Whether generate() should also emit timestamp tokens.
    stt_gen_return_timestamps: bool = field(
        default=False,
        metadata={
            "help": "Whether to return timestamps with transcriptions. Default is False."
        },
    )
    # Whisper task selector ('transcribe' vs 'translate').
    stt_gen_task: str = field(
        default="transcribe",
        metadata={
            "help": "The task to perform, typically 'transcribe' for transcription. Default is 'transcribe'."
        },
    )
    # Forced decoding language; None/'auto' semantics handled by the caller.
    language: Optional[str] = field(
        default='en',
        metadata={
            "help": """The language for the conversation.
            Choose between 'en' (english), 'fr' (french), 'es' (spanish),
            'zh' (chinese), 'ko' (korean), 'ja' (japanese), or 'None'.
            If using 'auto', the language is automatically detected and can
            change during the conversation. Default is 'en'."""
        },
    )