import gradio as gr
import pandas as pd
from css_html_js import custom_css
# Heading text for the leaderboard page; the leading and trailing newlines
# are part of the value.
TITLE = "\n🇲🇾 Malaysian Speech-to-Text Leaderboard\n"
# Markdown introduction for the leaderboard page: what the leaderboard is,
# the three evaluation datasets, and the post-processing applied to the test
# sets. One literal per markdown line; the value starts and ends with a
# newline.
INTRODUCTION_TEXT = (
    "\n"
    "📐 The 🇲🇾 Malaysian Speech-to-Text Leaderboard aims to track, rank and evaluate Malaysian Speech-to-Text models. All notebooks at https://github.com/mesolitica/malaysian-stt-benchmarks\n"
    "## Dataset\n"
    "📈 We evaluate models based on 3 datasets,\n"
    "1. Malaya-Speech test set, Malay language, https://huggingface.co/datasets/mesolitica/speech-test-set/tree/main/malaya-speech\n"
    "2. Fleurs MS-MY test set, Malay language, https://huggingface.co/datasets/mesolitica/speech-test-set/tree/main/fleurs-ms-my\n"
    "3. IMDA TTS first 700 audio files, English language but with Manglish slang, https://huggingface.co/datasets/mesolitica/IMDA-TTS\n"
    "## Heavy postprocess test set\n"
    "1. We filtered test set that contain numbers because malaya-speech transducer trained on normalized numbers.\n"
    "2. We lower case because malaya-speech transducer trained on lower case.\n"
    "3. We removed punctuation because malaya-speech transducer trained without punctuation.\n"
)
# Column labels for the results table, in display order.
# NOTE(review): the introduction text calls the second dataset
# "Fleurs MS-MY" while these labels say "MY-MS" -- confirm which is intended.
_RESULT_COLUMNS = (
    'model',
    'model size FP16 (MB)',
    'Malaya-Speech test CER',
    'Malaya-Speech test WER',
    'Fleurs MY-MS CER',
    'Fleurs MY-MS WER',
    'IMDA TTS CER',
    'IMDA TTS WER',
)

# One tuple per model, values in _RESULT_COLUMNS order. Rows with only six
# values have no IMDA TTS scores; those two keys are left out of the
# resulting dict entirely (not set to None).
_RESULT_ROWS = (
    ('goodtape.io', None,
     0.09504487340205486, 0.1691902868373457,
     0.03643102801583697, 0.08672758155453257),
    ('openai/whisper-large-v3', 3090,
     0.0349251317825172, 0.1032828282828283,
     0.026055551396846878, 0.07652049926522007,
     0.016648493852990828, 0.0386282289139432),
    ('openai/whisper-medium', 1530,
     0.05064920144820262, 0.17534205321090568,
     0.04366882208520179, 0.13546055192128273,
     0.02065587879424904, 0.047277690563404855),
    ('openai/whisper-small', 483.5,
     0.07485209857268262, 0.25748516055893106,
     0.06781078047622793, 0.21953142859857497,
     0.024812471688517194, 0.058901277294134434),
    ('openai/whisper-base', 145,
     0.3574879236610538, 0.8303456599563157,
     0.1319124653794061, 0.40499286081235003,
     0.03914533450681607, 0.08951682444539587),
    ('openai/whisper-tiny', 75.5,
     0.26941094281472105, 0.7414099751189915,
     0.38749733168917505, 0.812253445128297,
     0.048805770734828904, 0.11150629529200957),
    ('mesolitica/malaysian-whisper-medium', 1530,
     0.05622483776367814, 0.14406629724252673,
     0.025543266604368554, 0.07940219915492629,
     0.01971214262944062, 0.047223078508792794),
    ('mesolitica/malaysian-whisper-small', 483.5,
     0.049162419174983304, 0.15926901346983313,
     0.035517572531147, 0.10938718963023729,
     0.024228721439634855, 0.05546294182008469),
    ('mesolitica/malaysian-whisper-base', 145,
     0.07242006488452603, 0.22081683495617924,
     0.06639564802362424, 0.19675812232021192,
     0.03982418421412676, 0.08917690642690643),
    ('mesolitica/malaysian-whisper-tiny', 75.5,
     0.09423990117534763, 0.295029492365558,
     0.13390519685940314, 0.3461808122686204,
     0.07957313474501154, 0.1421708648494363),
    ('mesolitica/conformer-large-malay-whisper', 206.5,
     0.025933167255719317, 0.0912131356803488,
     0.02548791948171514, 0.08376713097429746),
    ('mesolitica/conformer-medium-malay-whisper', 121.5,
     0.024955598713609053, 0.09315638444736804,
     0.029205645523910067, 0.09253131557833799),
    ('mesolitica/conformer-medium-mixed', 121.5,
     0.034618711056551774, 0.11179440626161938,
     0.032894184549728075, 0.1026977414887425),
    ('mesolitica/conformer-tiny-ctc + mesolitica/kenlm-pseudolabel-whisper-large-v3', 7.9,
     0.0612581761581601, 0.21302693966628394,
     0.07573301800412188, 0.2527434609577528),
    ('mesolitica/conformer-12M-ctc + mesolitica/kenlm-pseudolabel-whisper-large-v3', 24.2,
     0.06941749946814912, 0.22261096523391607,
     0.07657934690019219, 0.263075623142674),
)

# Benchmark results as a list of per-model dicts. zip() stops at the shorter
# input, so a six-value row yields a dict with only the first six columns.
open_source = [dict(zip(_RESULT_COLUMNS, row)) for row in _RESULT_ROWS]
# Tabulate the per-model results; pandas fills a missing value for any
# column a row does not provide.
data = pd.DataFrame(open_source)

# Build the page. Components must be created inside the Blocks context to be
# attached to the layout, so the three calls below form the indented body of
# the `with` statement (an unindented body is an IndentationError).
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    gr.DataFrame(data)

# Start the web server once the layout is complete.
demo.launch()