# Gradio app that transcribes uploaded audio to Japanese text with the
# kotoba-whisper ASR model.

# NOTE(review): `spaces` is kept as the first import, as in the original
# ordering; ZeroGPU setups presumably want it imported before torch — confirm
# before reordering.
import spaces
import gradio as gr
import torch
from transformers import pipeline
import librosa

# Model configuration: use half precision, the first CUDA device and SDPA
# attention when a GPU is available; otherwise fall back to float32 on CPU.
model_id = "kotoba-tech/kotoba-whisper-v1.0"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
generate_kwargs = {"language": "japanese", "task": "transcribe"}

# Sampling rate (Hz) every upload is converted to before inference.
TARGET_SAMPLE_RATE = 16000

# Load the model once at import time so every request reuses the same pipeline.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    torch_dtype=torch_dtype,
    device=device,
    model_kwargs=model_kwargs,
)


@spaces.GPU(duration=120)
def transcribe(audio):
    """Transcribe an uploaded audio file and return the recognized text.

    Args:
        audio: Filesystem path of the uploaded audio, as supplied by the
            ``gr.Audio(type="filepath")`` input. ``None`` (or an empty
            string) when the user submitted without selecting a file.

    Returns:
        The ``"text"`` field of the pipeline result.

    Raises:
        gr.Error: If no audio file was provided, so the UI shows a readable
            message instead of an opaque loader failure.
    """
    if not audio:
        raise gr.Error("No audio provided. Please upload an audio file.")

    # Decode and resample in a single step; the native sampling rate is not
    # needed afterwards, so it is discarded.
    samples, _ = librosa.load(audio, sr=TARGET_SAMPLE_RATE)

    # Run inference. The raw array is passed without a sampling rate, so it
    # must already be at the rate the pipeline expects (hence the resample).
    # NOTE(review): no chunking is configured on the pipeline; long uploads
    # may need `chunk_length_s` — confirm against the model card.
    result = pipe(samples, generate_kwargs=generate_kwargs)
    return result["text"]


# Description shown in the interface; currently it contains only newlines.
description = """


"""
theme = "JohnSmith9982/small_and_pretty"

# Gradio interface definition
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Audio (MP3 or MP4)"),
    outputs="text",
    title="KotobaTranscriber",
    description=description,
    theme=theme,
)

# Launch the app, listening on all interfaces on port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860, share=True)