# Streamlit app that scores a machine translation against a reference
# translation using BLEU, TER, CHRF (via sacrebleu) and BERTScore.
from __future__ import annotations

import jieba
import sacrebleu
import streamlit as st
from bert_score import score as bert_score


def calculate_bleu(translations: list[str], references: list[str]) -> float:
    """Return the ``sacrebleu.corpus_bleu`` score for the given segments.

    Args:
        translations: Hypothesis segments, one string per segment.
        references: Reference segments aligned with ``translations``; they
            are passed to sacrebleu as a single reference stream.
    """
    return sacrebleu.corpus_bleu(translations, [references]).score


def calculate_ter(translations: list[str], references: list[str]) -> float:
    """Return the ``sacrebleu.corpus_ter`` score for the given segments.

    Args:
        translations: Hypothesis segments, one string per segment.
        references: Reference segments aligned with ``translations``; they
            are passed to sacrebleu as a single reference stream.
    """
    return sacrebleu.corpus_ter(translations, [references]).score


def calculate_chrf(translations: list[str], references: list[str]) -> float:
    """Return the ``sacrebleu.corpus_chrf`` score for the given segments.

    Args:
        translations: Hypothesis segments, one string per segment.
        references: Reference segments aligned with ``translations``; they
            are passed to sacrebleu as a single reference stream.
    """
    return sacrebleu.corpus_chrf(translations, [references]).score


def calculate_bertscore(
    translations: list[str], references: list[str], lang: str
) -> float:
    """Return the mean BERTScore F1 over all segments as a Python float.

    Args:
        translations: Hypothesis segments, one string per segment.
        references: Reference segments aligned with ``translations``.
        lang: Language code forwarded to ``bert_score`` as ``lang``.
    """
    # bert_score returns (precision, recall, F1); only F1 is reported.
    _, _, f1 = bert_score(translations, references, lang=lang)
    return f1.mean().item()


# Streamlit app
st.title("Machine Translation Quality Evaluation")
st.write(
    "Input the translated text and the reference translation to compute "
    "BLEU, TER, CHRF, and BERTScore metrics."
)

# Supported languages: display name -> language code. "Other" lets the user
# type a code manually.
languages = {
    "English": "en",
    "Chinese": "zh",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Russian": "ru",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Italian": "it",
    "Dutch": "nl",
    "Portuguese": "pt",
    "Turkish": "tr",
    "Polish": "pl",
    "Czech": "cs",
    "Swedish": "sv",
    "Danish": "da",
    "Finnish": "fi",
    "Greek": "el",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Norwegian": "no",
    "Romanian": "ro",
    "Thai": "th",
    "Vietnamese": "vi",
    "Hebrew": "he",
    "Hindi": "hi",
    "Bengali": "bn",
    "Tamil": "ta",
    "Urdu": "ur",
    "Other": "other",
}

# Language selection
source_lang = st.selectbox("Select Source Language", list(languages))
target_lang = st.selectbox("Select Target Language", list(languages))

# When "Other" is selected, ask for a custom code. The typed value is
# stripped and lower-cased so that input such as " ZH " still matches the
# lower-case codes used elsewhere in this script.
if source_lang == "Other":
    source_lang_code = (
        st.text_input(
            "Enter Source Language Code (ISO 639-1):",
            value=languages[source_lang],
        )
        .strip()
        .lower()
    )
else:
    source_lang_code = languages[source_lang]

if target_lang == "Other":
    target_lang_code = (
        st.text_input(
            "Enter Target Language Code (ISO 639-1):",
            value=languages[target_lang],
        )
        .strip()
        .lower()
    )
else:
    target_lang_code = languages[target_lang]

# Input fields for translations and references
translation_input = st.text_area("Translated Text", height=200)
reference_input = st.text_area("Reference Translation", height=200)

# Evaluate button
if st.button("Evaluate"):
    # Strip before validating so whitespace-only input is rejected instead of
    # reaching the metrics as an empty string.
    translation_text = translation_input.strip()
    reference_text = reference_input.strip()

    if translation_text and reference_text:
        translations = [translation_text]
        references = [reference_text]

        # Both the translation and the reference are written in the TARGET
        # language, so Chinese word segmentation applies only when the
        # target is Chinese; the source language does not affect it.
        if target_lang_code == "zh":
            translations = [" ".join(jieba.cut(text)) for text in translations]
            references = [" ".join(jieba.cut(text)) for text in references]

        bleu_score = calculate_bleu(translations, references)
        ter_score = calculate_ter(translations, references)
        chrf_score = calculate_chrf(translations, references)
        bertscore = calculate_bertscore(translations, references, target_lang_code)

        st.write(f"**BLEU Score:** {bleu_score:.2f}")
        st.write(f"**TER Score:** {ter_score:.2f}")
        st.write(f"**CHRF Score:** {chrf_score:.2f}")
        st.write(f"**BERTScore:** {bertscore:.2f}")
    else:
        st.error("Please provide both translated text and reference translation.")