import gradio as gr
from datasets import load_dataset
from tigerscore import TIGERScorer


DESCRIPTIONS = """
We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable and **R**eference-free evaluation over a wide spectrum of text generation tasks. Unlike other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by natural language instructions to provide error analysis that pinpoints the mistakes in the generated text.

### [**Website**](https://tiger-ai-lab.github.io/TIGERScore/)  [**Paper**](https://arxiv.org/abs/2310.00752)   [**Code**](https://github.com/TIGER-AI-Lab/TIGERScore)   [**TIGERScore-7B**](https://huggingface.co/TIGER-Lab/TIGERScore-7B)   [**TIGERScore-13B**](https://huggingface.co/TIGER-Lab/TIGERScore-13B)

"""

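# Stream the MetricInstruct training split instead of downloading it in full;
# shuffling with a fixed seed keeps the sampled examples stable across launches.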
EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train", streaming=True)
SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
EXAMPLES = []
fields = ["instruction", "input_context", "hypo_output"]
print("Loading examples...")
for ex in SHUFFLED_EXAMPLES_DATASET.take(100):
    # Skip examples with any empty field so the demo never shows blank inputs.
    if any(not ex[field] for field in fields):
        continue
    EXAMPLES.append([ex[field] for field in fields])

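# Load the GGUF-quantized checkpoint through llama.cpp so the demo can run on
# CPU (an assumption for a free Space); for GPU serving, the full-precision
# "TIGER-Lab/TIGERScore-7B" model linked above should work instead.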
scorer = TIGERScorer("TIGER-Lab/TIGERScore-7B-GGUF", use_llamacpp=True)

def submit_fn(input_context, generation_instruction, hypo_output, max_new_tokens=512, temperature=0.7, top_p=1.0):
    # Score a single (instruction, context, hypothesis) triple and return the
    # raw error-analysis text produced by TIGERScore.
    return scorer.score(
        insts=[generation_instruction],
        hypo_outputs=[hypo_output],
        input_contexts=[input_context],
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )[0]['raw_output'].strip()

def get_examples(inst_textbox, input_textbox, hypo_output_textbox):
    return inst_textbox, input_textbox, hypo_output_textbox

def clear_all(inst_textbox, input_textbox, hypo_output_textbox):
    return "", "", ""

with gr.Blocks(theme='gradio/soft') as demo:
     
    gr.Markdown("# 🐯 TIGERScore Demo")
    with gr.Row():
        gr.Markdown(DESCRIPTIONS)
        gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")

    gr.Markdown("## TIGERScore Inputs")
    inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
    input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
    hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)
    
    with gr.Row():
        clear_button = gr.Button('Clear', variant='primary')
        submit_button = gr.Button('Submit', variant='primary')
    
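    # Decoding knobs below are forwarded unchanged to TIGERScorer.score via
    # submit_fn; the slider defaults mirror the function's keyword defaults.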
    with gr.Accordion(label='Advanced options', open=False):
        max_new_tokens = gr.Slider(
            label='Max new tokens to generate',
            minimum=256,
            maximum=1024,
            step=1,
            value=512,
        )
        temperature = gr.Slider(
            label='Temperature of generation',
            minimum=0.1,
            maximum=2.0,
            step=0.1,
            value=0.7,
        )
        top_p = gr.Slider(
            label='Top-p of generation',
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=1.0,
        )

    gr.Markdown("## TIGERScore Outputs")
    evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)


    submit_button.click(
        fn=submit_fn,
        inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
        outputs=evaluation_output_textbox,
    )

    clear_button.click(
        fn=clear_all,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )
    
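    # gr.Examples pre-computes fn outputs when cache_examples=True; since
    # get_examples is an identity mapping, picking an example simply copies
    # its fields into the three input textboxes.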
    batch_examples = gr.Examples(
        examples=EXAMPLES,
        fn=get_examples,
        cache_examples=True,
        examples_per_page=5,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )  

    citations = gr.Markdown("""## Citation
```txt
@article{jiang2023TIGERScore,
  title={TIGERScore: Towards Building Explainable Metric for All Text Generation Tasks},
  author={Dongfu Jiang and Yishan Li and Ge Zhang and Wenhao Huang and Bill Yuchen Lin and Wenhu Chen},
  journal={arXiv preprint arXiv:2310.00752},
  year={2023}
}
```""")

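# Queue requests so concurrent users don't contend for the single loaded model;
# max_size=20 caps how many requests may wait before new ones are turned away.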
demo.queue(max_size=20).launch()