DongfuJiang commited on
Commit
152a862
1 Parent(s): 7ad9a04
Files changed (2) hide show
  1. app.py +57 -17
  2. requirements.txt +5 -2
app.py CHANGED
@@ -2,10 +2,12 @@ import os
2
  # os.environ["HF_HOME"] = "/data/.huggingface"
3
  import gradio as gr
4
  import sys
 
5
  from datasets import load_dataset
6
  from typing import List
7
- from tigerscore import TIGERScorer
8
-
 
9
 
10
  DESCRIPTIONS = """
11
  We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable, and **R**eference-free evaluation over a wide spectrum of text generation tasks. Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text.
@@ -24,18 +26,56 @@ for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET.take(100)):
24
  # continue
25
  EXAMPLES.append([ex[field] for field in fields])
26
 
27
- scorer = TIGERScorer("TIGER-Lab/TIGERScore-13B-GGUF", use_llamacpp=True)
28
-
29
- def submit_fn(input_context, generation_instruction, hypo_output, max_new_tokens=512, temperature=0.7, top_p=1.0):
30
- return scorer.score(
31
- insts=[generation_instruction],
32
- hypo_outputs=[hypo_output],
33
- input_contexts=[input_context],
34
- max_new_tokens=max_new_tokens,
35
- temperature=temperature,
36
- top_p=top_p,
37
- )[0]['raw_output'].strip()
38
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
 
41
  def get_examples(inst_textbox, input_textbox, hypo_output_textbox):
@@ -66,7 +106,7 @@ with gr.Blocks(theme='gradio/soft') as demo:
66
  minimum=256,
67
  maximum=1024,
68
  step=1,
69
- value=512,
70
  )
71
  temperature = gr.Slider(
72
  label='Temperature of generation',
@@ -85,10 +125,10 @@ with gr.Blocks(theme='gradio/soft') as demo:
85
 
86
  gr.Markdown("## TIGERScore Outputs")
87
  evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)
88
-
89
 
90
  submit_button.click(
91
- fn=submit_fn,
92
  inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
93
  outputs=evaluation_output_textbox,
94
  )
 
2
  # os.environ["HF_HOME"] = "/data/.huggingface"
3
  import gradio as gr
4
  import sys
5
+ import copy
6
  from datasets import load_dataset
7
  from typing import List
8
+ from llama_cpp import Llama
9
+ from huggingface_hub import hf_hub_download
10
+ from string import Template
11
 
12
  DESCRIPTIONS = """
13
  We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable, and **R**eference-free evaluation over a wide spectrum of text generation tasks. Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text.
 
26
  # continue
27
  EXAMPLES.append([ex[field] for field in fields])
28
 
29
# Prompt template for TIGERScore error analysis. Placeholders are
# string.Template-style (${...}) and are filled by generate_text():
#   ${generation_instruction} - the task instruction,
#   ${input_context}          - the source/context text,
#   ${hypothesis_output}      - the model output to be evaluated.
# NOTE: this is runtime prompt text sent to the model - keep wording intact.
TEMPLATE = """You are evaluating errors in a model-generated output for a given instruction.
Instruction:
${generation_instruction}
${input_context}

Model-generated Output:
${hypothesis_output}

For each error you give in the response, please also elaborate the following information:
- error location (the words that are wrong in the output)
- error aspect it belongs to.
- explanation why it's an error, and the correction suggestions.
- severity of the error ("Major" or "Minor").
- reduction of score (between 0.5 and 5 given the severity of the error)

Your evaluation output:
"""
46
+
47
# Download the quantized TIGERScore GGUF weights from the Hugging Face Hub
# (cached after the first run) and load them with llama.cpp.
# REPO_ID / MODEL_FILE environment variables allow overriding the defaults
# without a code change.
llm = Llama(
    model_path=hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "TIGER-Lab/TIGERScore-7B-GGUF"),
        filename=os.environ.get("MODEL_FILE", "ggml-model-q4_0.gguf"),
    ),
    n_ctx=2048,  # context window in tokens; prompt + generation must fit
    # n_gpu_layers=50, # change n_gpu_layers if you have more or less VRAM
)
55
+
56
def generate_text(input_context, generation_instruction, hypo_output, max_new_tokens=1024, temperature=0.7, top_p=1.0):
    """Stream a TIGERScore error analysis for a model-generated output.

    Fills TEMPLATE with the task instruction, input context and hypothesis
    output, then streams tokens from the llama.cpp model, yielding the
    accumulated text after every chunk so the Gradio textbox updates
    incrementally.

    Args:
        input_context: Source/context text the hypothesis was generated from.
        generation_instruction: Natural-language task instruction.
        hypo_output: The model-generated output to be evaluated.
        max_new_tokens: Cap on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Yields:
        str: The evaluation text produced so far (grows with each chunk).
    """
    prompt = Template(TEMPLATE).substitute(
        generation_instruction=generation_instruction,
        input_context=input_context,
        hypothesis_output=hypo_output,
    ).strip("\n ")
    gen_params = {
        "max_tokens": max_new_tokens,
        "top_p": top_p,
        "top_k": 40,
        "temperature": temperature,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
        "echo": False,   # do not repeat the prompt in the output
        "stream": True,  # llm(...) returns an iterator of partial chunks
    }
    accumulated = ""
    for chunk in llm(prompt, **gen_params):
        # Read the fragment directly: only chunk["choices"][0]["text"] is
        # consumed, and it is consumed before the next iteration, so the
        # per-chunk copy.deepcopy of the original was pure overhead.
        accumulated += chunk["choices"][0]["text"]
        yield accumulated
79
 
80
 
81
  def get_examples(inst_textbox, input_textbox, hypo_output_textbox):
 
106
  minimum=256,
107
  maximum=1024,
108
  step=1,
109
+ value=1024,
110
  )
111
  temperature = gr.Slider(
112
  label='Temperature of generation',
 
125
 
126
  gr.Markdown("## TIGERScore Outputs")
127
  evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)
128
+
129
 
130
  submit_button.click(
131
+ fn=generate_text,
132
  inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
133
  outputs=evaluation_output_textbox,
134
  )
requirements.txt CHANGED
@@ -1,2 +1,5 @@
1
- git+https://github.com/TIGER-AI-Lab/TIGERScore.git
2
- datasets==2.14.5
 
 
 
 
1
+ datasets==2.14.5
2
+ torch
3
+ transformers
4
+ llama-cpp-python