File size: 1,584 Bytes
d026c41
edf2658
 
 
 
4cd874e
edf2658
 
 
 
 
 
4cd874e
edf2658
 
 
 
 
 
 
 
4cd874e
d026c41
afb0b38
edf2658
afb0b38
4cd874e
edf2658
 
afb0b38
edf2658
 
 
 
 
 
 
db1910e
c010e1a
db1910e
 
edf2658
 
c06dc29
db1910e
edf2658
 
 
db1910e
b222b4d
edf2658
db1910e
edf2658
db1910e
edf2658
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import spaces
import gradio as gr
import torch
from transformers import pipeline
import librosa

# Model configuration.
# kotoba-whisper-v1.0 is a distilled Whisper model tuned for Japanese ASR.
model_id = "kotoba-tech/kotoba-whisper-v1.0"
# Use half precision and SDPA attention only when a CUDA GPU is available.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
# Fixed decoding options: always transcribe (not translate) Japanese speech.
generate_kwargs = {"language": "japanese", "task": "transcribe"}

# Load the model once at module import time so every request reuses the
# same pipeline instance (avoids reloading weights per call).
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    torch_dtype=torch_dtype,
    device=device,
    model_kwargs=model_kwargs
)

@spaces.GPU(duration=120)
def transcribe(audio):
    """Transcribe a Japanese audio file to text.

    Args:
        audio: Filesystem path to the uploaded audio file (gr.Audio with
            type="filepath" supplies a path); None when the user submits
            without uploading a file.

    Returns:
        The transcribed text, or an empty string when no audio was given.
    """
    # Gradio passes None when the form is submitted with no file;
    # librosa.load would raise on None, so return early instead of crashing.
    if audio is None:
        return ""

    # Whisper expects 16 kHz mono input. librosa.load resamples on the fly
    # when sr is given, so the separate librosa.resample step is unnecessary.
    audio_data, _ = librosa.load(audio, sr=16000)

    # Run inference with the fixed Japanese-transcription decoding options.
    result = pipe(audio_data, generate_kwargs=generate_kwargs)
    return result["text"]

description = """
<p align="center">
<img src="https://huggingface.co/datasets/MakiAi/IconAssets/resolve/main/KotobaTranscriber.png" width="70%">
<br>
</p>
"""

theme='JohnSmith9982/small_and_pretty'

# Gradio interface definition.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Audio (MP3 or MP4)"),
    outputs="text",
    title="KotobaTranscriber",
    description=description,
    theme=theme,
)
# Launch the app: listen on all interfaces at port 7860 and also create a
# temporary public share link.
iface.launch(server_name="0.0.0.0", server_port=7860, share=True)