import spaces
import gradio as gr
import torch
from transformers import pipeline
import librosa
# Model configuration
model_id = "kotoba-tech/kotoba-whisper-v1.0"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"
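# Use PyTorch's scaled dot-product attention (SDPA) kernel when a GPU is available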
model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
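# Always transcribe in Japanese (no translation)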
generate_kwargs = {"language": "japanese", "task": "transcribe"}
# Load the model
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    torch_dtype=torch_dtype,
    device=device,
    model_kwargs=model_kwargs,
)
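# On ZeroGPU Spaces, @spaces.GPU allocates a GPU for this call (up to 120 seconds here)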
@spaces.GPU(duration=120)
def transcribe(audio):
    # Load the audio file
    audio_data, sr = librosa.load(audio, sr=None)
    # Resample to the 16 kHz rate expected by the model
    target_sr = 16000
    audio_resampled = librosa.resample(audio_data, orig_sr=sr, target_sr=target_sr)
    # Run inference
    result = pipe(audio_resampled, generate_kwargs=generate_kwargs)
    return result["text"]
description = """
<p align="center">
<img src="https://huggingface.co/datasets/MakiAi/IconAssets/resolve/main/KotobaTranscriber.png" width="70%">
<br>
</p>
"""
theme = 'JohnSmith9982/small_and_pretty'
# Define the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Audio (MP3 or MP4)"),
    outputs="text",
    title="KotobaTranscriber",
    description=description,
    theme=theme,
)
# Launch the app
iface.launch(server_name="0.0.0.0", server_port=7860, share=True)