DongfuJiang committed
Commit
04efd2c
1 Parent(s): 4a2ff24
Files changed (3)
  1. app.py +22 -21
  2. requirements.txt +2 -1
  3. utils.py +0 -85
app.py CHANGED
@@ -3,7 +3,7 @@ import sys
 import os
 from datasets import load_dataset
 from typing import List
-import utils
+from tigerscore import TIGERScorer
 
 
 DESCRIPTIONS = """
@@ -13,10 +13,10 @@ We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction *
 
 """
 
-EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train_mix")
+EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train")
 SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
 EXAMPLES = []
-fields = ["task", "instruction", "input_context", "hypo_output"]
+fields = ["instruction", "input_context", "hypo_output"]
 print("Loading examples...")
 for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET):
     if any([not ex[field] for field in fields]):
@@ -25,13 +25,19 @@ for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET):
     if i >= 100:
         break
 
-def tigerscore(task, input_context, generation_instruction, hypo_output, max_new_tokens=512, temperature=0.7, top_p=1.0):
-    return utils.generate(
-        task, input_context,
-        generation_instruction, hypo_output,
-        max_new_tokens=max_new_tokens,
-        temperature=temperature, top_p=top_p
-    )
+scorer = TIGERScorer("TIGER-Lab/TIGERScore-7B-GGUF", use_llamacpp=True)
+
+def submit_fn(input_context, generation_instruction, hypo_output, max_new_tokens=512, temperature=0.7, top_p=1.0):
+    return scorer.score(
+        insts=[generation_instruction],
+        hypo_outputs=[hypo_output],
+        input_contexts=[input_context],
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+    )[0]['raw_output'].strip()
+
+
 
 def get_examples(task, inst_textbox, input_textbox, hypo_output_textbox):
     return gr.Dropdown.update(value=task), inst_textbox, input_textbox, hypo_output_textbox
@@ -39,10 +45,6 @@ def get_examples(task, inst_textbox, input_textbox, hypo_output_textbox):
 def clear_all(task, inst_textbox, input_textbox, hypo_output_textbox):
     return gr.Dropdown.update(value=task), "", "", ""
 
-## initialize the model
-print("Loading TIGERScore model...")
-utils.load_tigerscore("7b")
-
 with gr.Blocks(theme='gradio/soft') as demo:
 
     gr.Markdown("# 🐯 TIGERScore Demo")
@@ -51,7 +53,6 @@ with gr.Blocks(theme='gradio/soft') as demo:
     gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")
 
     gr.Markdown("## TIGERScore Inputs")
-    tasks_dropdown = gr.Dropdown(label="Task", choices=utils.tasks, value="translation", show_label=True, allow_custom_value=True)
     inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
     input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
     hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)
@@ -88,15 +89,15 @@ with gr.Blocks(theme='gradio/soft') as demo:
 
 
     submit_button.click(
-        fn=tigerscore,
-        inputs=[tasks_dropdown, input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
+        fn=submit_fn,
+        inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
        outputs=evaluation_output_textbox,
     )
 
     clear_button.click(
         fn=clear_all,
-        inputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
-        outputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
+        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
+        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
     )
 
     batch_examples = gr.Examples(
@@ -104,8 +105,8 @@ with gr.Blocks(theme='gradio/soft') as demo:
         fn=get_examples,
         cache_examples=True,
         examples_per_page=5,
-        inputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
-        outputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
+        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
+        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
     )
 
     citations = gr.Markdown("""## Citation
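Taken together, the app.py changes swap the local transformers-based loader for the packaged scorer: a GGUF build of the 7B model run through llama.cpp, with the task dropdown removed since the packaged score() call takes no task argument. Below is a minimal standalone sketch of the new scoring path; the constructor and keyword names are copied from this diff, while the three example strings are hypothetical.

from tigerscore import TIGERScorer

# GGUF build of the 7B model via llama.cpp, as in the new app.py.
scorer = TIGERScorer("TIGER-Lab/TIGERScore-7B-GGUF", use_llamacpp=True)

# score() takes parallel lists and returns one result dict per example;
# the demo surfaces the free-text analysis in results[i]['raw_output'].
results = scorer.score(
    insts=["Summarize the following article in one sentence."],  # hypothetical
    input_contexts=["The city council voted on Tuesday to expand the bike-lane network."],  # hypothetical
    hypo_outputs=["The council rejected the bike-lane plan."],  # hypothetical
    max_new_tokens=512,
    temperature=0.7,
    top_p=1.0,
)
print(results[0]['raw_output'].strip())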
requirements.txt CHANGED
@@ -28,4 +28,5 @@ rouge_score
 bs4
 py7zr
 sacrebleu
-gdown
+gdown
+bitsandbytes
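bitsandbytes is presumably added to support quantized model loading; nothing in this commit shows how it is wired up, so the following is only a generic sketch of the standard transformers integration, not code from this repo (the model name is illustrative).

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Generic 8-bit loading pattern enabled by the bitsandbytes dependency.
model = AutoModelForCausalLM.from_pretrained(
    "TIGER-Lab/TIGERScore-7B-V1.0",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)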
utils.py DELETED
@@ -1,85 +0,0 @@
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from string import Template
-import torch
-
-FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task."
-FINETUNE_INPUT = """\
-Task instruction: ${generation_instruction}
-Source: ${input_context}
-Model-generated Output: ${hypothesis_output}
-
-Based on the given task instruction and source, identify errors in this model-generated output.
-For each error you give in the response, please also elaborate the following information:
-- error location (the words that are wrong in the output)
-- error aspect it belongs to.
-- explanation why it's an error, and the correction suggestions.
-- severity of the error ("Major" or "Minor").
-- reduction of score (between 0.5 and 5 given the severity of the error)
-
-Your evaluation output:
-"""
-
-TIGERScore_model_map = {
-    "7b": "TIGER-Lab/TIGERScore-7B-V1.0",
-    "13b": "TIGER-Lab/TIGERScore-13B-V1.0",
-}
-tigerscore_model = None
-tigerscore_tokenizer = None
-
-tasks = [
-    "translation",
-    "summarization",
-    "data2text",
-    "mathQA",
-    "long-form QA",
-    "instruction-following",
-]
-
-def load_tigerscore(model_size):
-    assert model_size in TIGERScore_model_map
-    model_name = TIGERScore_model_map[model_size]
-    global tigerscore_model, tigerscore_tokenizer
-    tigerscore_model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=torch.bfloat16,
-        device_map="auto"
-    )
-    tigerscore_tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        use_fast=True
-    )
-
-def generate(task, input_context, generation_instruction, hypo_output, **generate_kwargs):
-    inst_part = Template(FINETUNE_INST)
-    inst_part = inst_part.substitute(task=task)
-    input_part = Template(FINETUNE_INPUT)
-    input_part = input_part.substitute(
-        generation_instruction=generation_instruction,
-        input_context=input_context,
-        hypothesis_output=hypo_output
-    )
-    prompt = (inst_part + "\n" + input_part).strip("\n ") + "\n"
-    encodings = tigerscore_tokenizer(prompt, return_tensors="pt")
-    input_ids = encodings["input_ids"].to(tigerscore_model.device)
-    attention_mask = encodings["attention_mask"].to(tigerscore_model.device)
-    gen_params = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "max_new_tokens": 512,
-        "do_sample": True,
-        "top_k": 1,
-        "num_return_sequences": 1,
-    }
-    gen_params.update(generate_kwargs)
-    output = tigerscore_model.generate(**gen_params)
-    output = tigerscore_tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)
-    return output
-
-if __name__ == "__main__":
-    task = "translation"
-    input_context = "Der künftige EM-Cheforganisator Philipp Lahm soll laut Grindel im DFB-Präsidium mitarbeiten."
-    generation_instruction = "Translate the following text from German to English."
-    hypo_output = "According to Grindel, the future head of the European Championships, Philipp Lahm, is to participate in the DFB Presidency."
-    output = generate(task, input_context, generation_instruction, hypo_output)
-    print(output)
-
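The deleted module's core logic was the prompt assembly; this standalone sketch re-runs it with the templates and the German-to-English smoke-test strings copied from the deleted file, so a reviewer can see the exact prompt the old code fed to the model. Note also that the deleted generate() used do_sample=True with top_k=1, which is effectively greedy decoding.

from string import Template

# Templates copied verbatim from the deleted utils.py.
FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task."
FINETUNE_INPUT = """\
Task instruction: ${generation_instruction}
Source: ${input_context}
Model-generated Output: ${hypothesis_output}

Based on the given task instruction and source, identify errors in this model-generated output.
For each error you give in the response, please also elaborate the following information:
- error location (the words that are wrong in the output)
- error aspect it belongs to.
- explanation why it's an error, and the correction suggestions.
- severity of the error ("Major" or "Minor").
- reduction of score (between 0.5 and 5 given the severity of the error)

Your evaluation output:
"""

# Example strings copied from the deleted __main__ block.
inst_part = Template(FINETUNE_INST).substitute(task="translation")
input_part = Template(FINETUNE_INPUT).substitute(
    generation_instruction="Translate the following text from German to English.",
    input_context="Der künftige EM-Cheforganisator Philipp Lahm soll laut Grindel im DFB-Präsidium mitarbeiten.",
    hypothesis_output="According to Grindel, the future head of the European Championships, Philipp Lahm, is to participate in the DFB Presidency.",
)
prompt = (inst_part + "\n" + input_part).strip("\n ") + "\n"
print(prompt)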