import os
import gradio as gr
import spaces
from datasets import load_dataset
from string import Template
from tigerscore import TIGERScorer

DESCRIPTIONS = """
We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable and **R**eference-free evaluation over a wide spectrum of text generation tasks. Unlike other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by natural language instructions to provide an error analysis that pinpoints the mistakes in the generated text.

[**Website**](https://tiger-ai-lab.github.io/TIGERScore/) | 
[**Paper**](https://arxiv.org/abs/2310.00752) | 
[**Code**](https://github.com/TIGER-AI-Lab/TIGERScore) | 
[**TIGERScore-7B**](https://huggingface.co/TIGER-Lab/TIGERScore-7B) |
[**TIGERScore-13B**](https://huggingface.co/TIGER-Lab/TIGERScore-13B)

"""

# Stream 100 shuffled examples from the MetricInstruct dataset to populate the demo.
EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train", streaming=True)
SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
EXAMPLES = []
fields = ["instruction", "input_context", "hypo_output"]
print("Loading examples...")
for ex in SHUFFLED_EXAMPLES_DATASET.take(100):
    EXAMPLES.append([ex[field] for field in fields])

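# Prompt template for TIGERScore, kept verbatim; it presumably matches the format
# the model was instruction-tuned on, so edits here could degrade scoring quality.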
TEMPLATE = """You are evaluating errors in a model-generated output for a given instruction.
Instruction: 
${generation_instruction}
${input_context}

Model-generated Output: 
${hypothesis_output}

For each error you give in the response, please also elaborate the following information:
- error location (the words that are wrong in the output)
- error aspect it belongs to.
- explanation why it's an error, and the correction suggestions.
- severity of the error ("Major" or "Minor"). 
- reduction of score (between 0.5 and 5 given the severity of the error)

Your evaluation output:
"""

# Optional llama.cpp backend (disabled by default). Uncommenting this block defines
# `llm`, which generate_text_llamacpp below depends on.
# from huggingface_hub import hf_hub_download
# from llama_cpp import Llama
# llm = Llama(
#     model_path=hf_hub_download(
#         repo_id=os.environ.get("REPO_ID", "TIGER-Lab/TIGERScore-13B-GGUF"),
#         filename=os.environ.get("MODEL_FILE", "ggml-model-q4_0.gguf"),
#     ),
#     n_ctx=2048,
#     n_gpu_layers=50,  # adjust n_gpu_layers to match the available VRAM
# )
scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-13B")
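# For non-streaming use, the TIGERScorer wrapper also supports batch scoring; a
# minimal sketch, assuming the batch API from the TIGERScore repo:
#   results = scorer.score([instruction], [hypo_output], [input_context])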

@spaces.GPU(duration=60)  # request a ZeroGPU slot for up to 60 seconds per call
def generate_text_hf(input_context, generation_instruction, hypo_output, max_new_tokens=1024, temperature=0.7, top_p=1.0):
    """Stream TIGERScore's error analysis token by token using the Transformers backend."""
    scorer.model = scorer.model.to("cuda")
    for output in scorer.generate_stream(generation_instruction, hypo_output, input_context, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p):
        yield output
        
def generate_text_llamacpp(input_context, generation_instruction, hypo_output, max_new_tokens=1024, temperature=0.7, top_p=1.0):
    """Stream TIGERScore output via llama.cpp; requires the `llm` block above to be uncommented."""
    prompt = Template(TEMPLATE).substitute(
        generation_instruction=generation_instruction,
        input_context=input_context,
        hypothesis_output=hypo_output,
    ).strip("\n ")
    gen_params = {
        "max_tokens": max_new_tokens,
        "top_p": top_p,
        "top_k": 40,
        "temperature": temperature,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
        "echo": False,   # do not echo the prompt in the output
        "stream": True,  # yield tokens incrementally
    }
    # Accumulate streamed chunks so each yield carries the full text so far.
    text = ""
    for out in llm(prompt, **gen_params):
        text += out["choices"][0]["text"]
        yield text

def get_examples(inst_textbox, input_textbox, hypo_output_textbox):
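    # Pass the selected example's fields straight through to the input textboxes.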
    return inst_textbox, input_textbox, hypo_output_textbox

def clear_all(inst_textbox, input_textbox, hypo_output_textbox):
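    # Ignore the current values and reset all three input textboxes.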
    return "", "", ""

with gr.Blocks(theme='gradio/soft') as demo:
     
    gr.Markdown("# 🐯 TIGERScore Demo")
    with gr.Row():
        gr.Markdown(DESCRIPTIONS)
        gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")

    gr.Markdown("## TIGERScore Inputs")
    inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
    input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
    hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)
    
    with gr.Row():
        clear_button = gr.Button('Clear', variant='primary')
        submit_button = gr.Button('Submit', variant='primary')
    
    with gr.Accordion(label='Advanced options', open=False):
        max_new_tokens = gr.Slider(
            label='Max new tokens to generate',
            minimum=256,
            maximum=1024,
            step=1,
            value=1024,
        )
        temperature = gr.Slider(
            label='Temperature of generation',
            minimum=0.1,
            maximum=2.0,
            step=0.1,
            value=0.7,
        )
        top_p = gr.Slider(
            label='Top-p of generation',
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=1.0,
        )

    gr.Markdown("## TIGERScore Outputs")
    evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)
    

    submit_button.click(
        fn=generate_text_hf,
        inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
        outputs=evaluation_output_textbox,
    )

    clear_button.click(
        fn=clear_all,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )
    
    batch_examples = gr.Examples(
        examples=EXAMPLES,
        fn=get_examples,
        cache_examples=True,
        examples_per_page=5,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )  

    citations = gr.Markdown("""## Citation
```bibtex
@article{jiang2023TIGERScore,
  title={TIGERScore: Towards Building Explainable Metric for All Text Generation Tasks},
  author={Dongfu Jiang and Yishan Li and Ge Zhang and Wenhao Huang and Bill Yuchen Lin and Wenhu Chen},
  journal={arXiv preprint arXiv:2310.00752},
  year={2023}
}
```""")

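# Queue incoming requests (at most 20 pending) and launch the app; queuing lets
# the streamed generator outputs above be served incrementally.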
demo.queue(max_size=20).launch()