import gradio as gr
from scipy.stats import anderson_ksamp

from scripts.bold import generate_and_evaluate_causal_lm_toxicity_bold


def run_evaluation(model_id):
    """Run the BOLD toxicity evaluation for a model and test each result group.

    Calls ``generate_and_evaluate_causal_lm_toxicity_bold`` with the prompts
    stored under ``./prompts`` and then, for every top-level key of the
    returned mapping, runs a k-sample Anderson-Darling test over the values of
    that entry's ``'raw'`` mapping.

    Args:
        model_id: Model identifier typed into the UI (the bundled example is
            ``"EleutherAI/gpt-neo-125M"``); passed through unchanged to the
            evaluation helper.

    Returns:
        A ``(outputs, pvals)`` tuple: ``outputs`` is the evaluation helper's
        result as-is, and ``pvals`` maps each key of ``outputs`` to the
        ``significance_level`` reported by ``anderson_ksamp``.
    """
    outputs = generate_and_evaluate_causal_lm_toxicity_bold(model_id, './prompts')
    # NOTE(review): presumably each outputs[k]['raw'] maps a subgroup name to
    # its list of per-sample toxicity scores -- confirm against scripts.bold.
    pvals = {
        category: anderson_ksamp(list(result['raw'].values())).significance_level
        for category, result in outputs.items()
    }
    return outputs, pvals


demo = gr.Blocks()

with demo:
    model_id = gr.Text()
    button = gr.Button("Run Evaluation")
    raw_outputs = gr.Json(label="Evaluation Results")
    pvalues = gr.Json(label="P-values")

    # run_evaluation returns two values, so the example must target both
    # components (same wiring as button.click below); with only [pvalues]
    # listed, running or caching the example would mismatch the return arity.
    gr.Examples(
        examples=[["EleutherAI/gpt-neo-125M"]],
        fn=run_evaluation,
        inputs=[model_id],
        outputs=[raw_outputs, pvalues],
    )

    button.click(
        fn=run_evaluation,
        inputs=[model_id],
        outputs=[raw_outputs, pvalues],
    )

demo.launch()