# NOTE: this listing was copied from the Hugging Face Spaces file viewer
# (Space status: Sleeping; file size: 4,697 bytes). The viewer's per-line
# commit-hash and line-number gutters are not part of the program.
import gradio as gr
import sys
import os
from datasets import load_dataset
from typing import List
from tigerscore import TIGERScorer
# Markdown blurb for the demo page: a short summary of TIGERScore followed by
# links to the website, paper, code, and model checkpoints.
DESCRIPTIONS = """
We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable, and **R**eference-free evaluation over a wide spectrum of text generation tasks. Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text.
### [**Website**](https://tiger-ai-lab.github.io/TIGERScore/) [**Paper**](https://arxiv.org/abs/2310.00752) [**Code**](https://github.com/TIGER-AI-Lab/TIGERScore) [**TIGERScore-7B**](https://huggingface.co/TIGER-Lab/TIGERScore-7B) [**TIGERScore-13B**](https://huggingface.co/TIGER-Lab/TIGERScore-13B)
"""
# Open the MetricInstruct training split in streaming mode and take the demo
# examples from a seeded shuffle of that stream.
EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train", streaming=True)
SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)

# Columns copied out of each record, in the order the example rows are built:
# instruction, input context, hypothesis output.
fields = ["instruction", "input_context", "hypo_output"]

print("Loading examples...")
# One row per record, for the first 100 shuffled records.
# NOTE: rows with empty fields are kept; the filter below is disabled here.
#     if any([not ex[field] for field in fields]):
#         continue
EXAMPLES = [
    [ex[field] for field in fields]
    for ex in SHUFFLED_EXAMPLES_DATASET.take(100)
]

# The scorer is disabled in this copy of the demo (submit_fn returns a stub):
# scorer = TIGERScorer("TIGER-Lab/TIGERScore-7B-GGUF", use_llamacpp=True)
def submit_fn(
    input_context: str,
    generation_instruction: str,
    hypo_output: str,
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 1.0,
) -> str:
    """Handle a click on the Submit button.

    The call into the TIGERScore model is commented out below, so every
    argument is currently ignored and a fixed placeholder is returned.

    Args:
        input_context: Source text the hypothesis output was generated from.
        generation_instruction: Instruction that was given for the generation.
        hypo_output: Generated text to be evaluated.
        max_new_tokens: Generation length limit for the (disabled) scorer call.
        temperature: Sampling temperature for the (disabled) scorer call.
        top_p: Top-p value for the (disabled) scorer call.

    Returns:
        The literal string ``"None"``.
    """
    # Disabled scoring call, kept so it can be restored together with `scorer`:
    # return scorer.score(
    #     insts=[generation_instruction],
    #     hypo_outputs=[hypo_output],
    #     input_contexts=[input_context],
    #     max_new_tokens=max_new_tokens,
    #     temperature=temperature,
    #     top_p=top_p,
    # )[0]['raw_output'].strip()
    return "None"
def get_examples(inst_textbox: str, input_textbox: str, hypo_output_textbox: str) -> tuple:
    """Return the three example values unchanged, in the order received.

    Args:
        inst_textbox: Instruction text of the example.
        input_textbox: Input-context text of the example.
        hypo_output_textbox: Hypothesis-output text of the example.

    Returns:
        The tuple ``(inst_textbox, input_textbox, hypo_output_textbox)``.
    """
    return inst_textbox, input_textbox, hypo_output_textbox
def clear_all(inst_textbox: str, input_textbox: str, hypo_output_textbox: str) -> tuple:
    """Return three empty strings, one for each input textbox.

    The current textbox contents are accepted but not read.

    Args:
        inst_textbox: Current instruction text (ignored).
        input_textbox: Current input-context text (ignored).
        hypo_output_textbox: Current hypothesis-output text (ignored).

    Returns:
        The tuple ``("", "", "")``.
    """
    return "", "", ""
# Build the demo page: header, three input textboxes, Clear/Submit buttons,
# generation sliders, an output textbox, example rows, and the citation.
# NOTE(review): the nesting under gr.Row / gr.Accordion was reconstructed from
# a copy whose indentation had been stripped -- confirm the grouping against
# the deployed Space.
with gr.Blocks(theme='gradio/soft') as demo:
    gr.Markdown("# 🐯 TIGERScore Demo")
    with gr.Row():
        gr.Markdown(DESCRIPTIONS)
        gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")

    gr.Markdown("## TIGERScore Inputs")
    inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
    input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
    hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)

    with gr.Row():
        clear_button = gr.Button('Clear', variant='primary')
        submit_button = gr.Button('Submit', variant='primary')

    # Generation settings that are passed to submit_fn on Submit.
    with gr.Accordion(label='Advanced options', open=False):
        max_new_tokens = gr.Slider(
            label='Max new tokens to generate',
            minimum=256,
            maximum=1024,
            step=1,
            value=512,
        )
        temperature = gr.Slider(
            label='Temperature of generation',
            minimum=0.1,
            maximum=2.0,
            step=0.1,
            value=0.7,
        )
        top_p = gr.Slider(
            label='Top-p of generation',
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=1.0,
        )

    gr.Markdown("## TIGERScore Outputs")
    evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)

    # submit_fn takes the input context first, so this list is ordered
    # (context, instruction, hypothesis) rather than in on-screen order.
    submit_button.click(
        fn=submit_fn,
        inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
        outputs=evaluation_output_textbox,
    )

    # clear_all returns three empty strings, one per textbox listed in outputs.
    clear_button.click(
        fn=clear_all,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )

    # Example rows; each row is [instruction, input context, hypothesis output]
    # and get_examples hands the values straight back to the same textboxes.
    batch_examples = gr.Examples(
        examples=EXAMPLES,
        fn=get_examples,
        cache_examples=True,
        examples_per_page=5,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )

    citations = gr.Markdown("""## Citation
```txt
@article{jiang2023TIGERScore,
title={TIGERScore: Towards Building Explainable Metric for All Text Generation Tasks},
author={Dongfu Jiang, Yishan Li, Ge Zhang, Wenhao Huang, Bill Yuchen Lin, Wenhu Chen},
journal={arXiv preprint arXiv:2310.00752},
year={2023}
}
```""")
# Enable the request queue (max_size=20) and start serving the demo.
demo.queue(max_size=20).launch()