Miaoran000 committed
Commit 150bb15 (1 parent: 8a6bfdc)

minor update and extend to support different APIs

.gitignore CHANGED
@@ -15,3 +15,6 @@ eval-queue-bk/
 eval-results-bk/
 
 src/assets/model_counts.html
+
+generated_results/
+Hallucination Leaderboard Results
generation_results/CohereForAI/c4ai-command-r-plus.csv ADDED
The diff for this file is too large to render. See raw diff
 
generation_results/databricks/dbrx-instruct.csv ADDED
The diff for this file is too large to render. See raw diff
 
generation_results/google/gemma-1.1-2b-it.csv ADDED
The diff for this file is too large to render. See raw diff
 
generation_results/google/gemma-1.1-7b-it.csv ADDED
The diff for this file is too large to render. See raw diff
 
generation_results/microsoft/WizardLM-2-8x22B.csv ADDED
The diff for this file is too large to render. See raw diff
 
generation_results/mistralai/mixtral-8x22b.csv ADDED
The diff for this file is too large to render. See raw diff
 
generation_results/mistralai/mixtral-8x22b_v1.csv ADDED
The diff for this file is too large to render. See raw diff
 
generation_results/openai/GPT-4-Turbo.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/backend/evaluate_model.py CHANGED
@@ -1,5 +1,7 @@
 import logging
 import pandas as pd
+import os
+import csv
 
 import src.envs as envs
 
@@ -70,13 +72,16 @@ class Evaluator:
         """
         try:
             df = pd.read_csv(envs.DATASET_PATH)
-            generated_summaries_df = self.summary_generator.generate_summaries(df)
+            self.generated_summaries_df = self.summary_generator.generate_summaries(
+                df, save_path=f"generation_results/{self.model}.csv")
 
             avg_summary_len = self.summary_generator.avg_length
             answer_rate = self.summary_generator.answer_rate
 
-            hallucination_scores = self.eval_model.evaluate_hallucination(
-                generated_summaries_df)
+            self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
+                self.generated_summaries_df)
             factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
             hallucination_rate = self.eval_model.hallucination_rate
 
@@ -93,3 +98,38 @@ class Evaluator:
         except Exception as e:
             logging.error(f"Error during evaluation: {e}")
             raise
+
+    def write_results(self):
+        """Append the generated summaries and their HEM scores to the local leaderboard CSVs."""
+        print('Updating result files')
+        leaderboard_path = os.getcwd()  # path of the leaderboard folder
+        working_path = os.path.join(leaderboard_path, 'Hallucination Leaderboard Results')
+        if not os.path.exists(working_path):
+            logging.error("The results must first be downloaded from Google Drive into the leaderboard folder")
+            raise FileNotFoundError(working_path)
+
+        source_summary_df = self.generated_summaries_df[["source", "summary"]]
+
+        # Update leaderboard_summaries.csv.
+        # TODO: previous rows for the current model are not removed before appending.
+        leaderboard_summaries_df = source_summary_df
+        leaderboard_summaries_df.insert(2, "model", [self.model] * leaderboard_summaries_df.shape[0])
+        leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'),
+                                        mode='a', index=False, header=False)
+        print('leaderboard_summaries.csv has been updated')
+
+        # Update leaderboard_summaries_with_scores.csv.
+        # NOTE: re-reading this file to drop previous rows for the model currently fails, so new rows are appended as-is.
+        leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
+        leaderboard_summaries_with_scores_df.insert(3, "model", [self.model] * leaderboard_summaries_with_scores_df.shape[0])
+        leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
+                                                    mode='a', index=False, header=False)
+        print('leaderboard_summaries_with_scores.csv has been updated')
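For orientation, a minimal driver sketch for the updated Evaluator (not part of the commit). The positional arguments mirror the Evaluator(...) call in run_eval_suite.py below; the model id, revision, precision, and device values are illustrative, and write_results() assumes the 'Hallucination Leaderboard Results' folder already exists in the working directory.

# Hedged usage sketch; argument values are placeholders.
from src.backend.evaluate_model import Evaluator

evaluator = Evaluator("mistralai/mixtral-8x22b", "main", "float16",   # model, revision, precision
                      1, "cuda", True, None,                          # batch_size, device, no_cache, limit
                      write_out=True, output_base_path="logs")
results = evaluator.evaluate()   # generates (or reloads) summaries, then scores them
evaluator.write_results()        # appends rows to the two leaderboard CSVs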
src/backend/manage_requests.py CHANGED
@@ -12,7 +12,7 @@ class EvalRequest:
     model: str
     # private: bool
     status: str
-    json_filepath: str
+    json_filepath: str = None
     private: bool = False
     weight_type: str = "Original"
     model_type: str = ""  # pretrained, finetuned, with RL
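Because json_filepath now defaults to None, an EvalRequest no longer has to reference a request JSON file. A small sketch, assuming model and status are the only other required fields of the dataclass (the full field list is not visible in this hunk) and using placeholder values:

# Hedged sketch; field values are hypothetical.
from src.backend.manage_requests import EvalRequest

req = EvalRequest(model="openai/GPT-4-Turbo", status="PENDING")
assert req.json_filepath is None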
src/backend/model_operations.py CHANGED
@@ -2,17 +2,30 @@ import os
 import time
 from datetime import datetime
 import logging
+from pathlib import Path
+import requests
+import json
 
 import numpy as np
 import pandas as pd
 import spacy
 from sentence_transformers import CrossEncoder
-from litellm import completion
+import litellm
 from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
+import torch
+import cohere
+from openai import OpenAI
 
 import src.backend.util as util
 import src.envs as envs
 
+litellm.set_verbose = False
+
 # Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s - %(message)s')
@@ -36,18 +49,6 @@ def load_evaluation_model(model_path):
     return model
 
 
-def generate_summary(model: str, system_prompt: str, user_prompt: str, api_base: str):
-    response = completion(
-        model=model,
-        messages=[{"role": "system", "content": system_prompt},
-                  {"role": "user", "content": user_prompt}],
-        temperature=0.0,
-        max_tokens=1024,
-        api_base=api_base,
-    )
-    return response['choices'][0]['message']['content']
-
-
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.
 
@@ -82,6 +83,7 @@ class SummaryGenerator:
             model_id (str): Identifier for the model.
             revision (str): Revision of the model.
         """
+        self.model_id = model_id
         self.model = f"huggingface/{model_id}"
         self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
         self.summaries_df = pd.DataFrame()
@@ -89,8 +91,9 @@ class SummaryGenerator:
         self.avg_length = None
         self.answer_rate = None
         self.exceptions = None
+        self.local_model = None
 
-    def generate_summaries(self, df):
+    def generate_summaries(self, df, save_path=None):
         """Generate summaries for a given DataFrame of source docs.
 
         Args:
@@ -99,47 +102,155 @@ class SummaryGenerator:
         Returns:
             summaries_df (DataFrame): Generated summaries by the model.
         """
-        source, summary, dataset = [], [], []
         exceptions = []
-
-        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
-            _source = row['text']
-            _dataset = row['dataset']
-
-            system_prompt = envs.SYSTEM_PROMPT
-            user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
-
-            while True:
-                try:
-                    _summary = generate_summary(self.model, system_prompt,
-                                                user_prompt, self.api_base)
-                    break
-                except Exception as e:
-                    if 'Rate limit reached' in str(e):
-                        wait_time = 3660
-                        current_time = datetime.now().strftime('%H:%M:%S')
-                        print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
-                        time.sleep(wait_time)
-                    else:
-                        print(f"Error at index {index}: {e}")
-                        _summary = ""
-                        exceptions.append(index)
-                        break
-
-            summary.append(_summary)
-            source.append(_source)
-            dataset.append(_dataset)
-
-            # Sleep to prevent hitting rate limits too frequently
-            time.sleep(1)
-
-        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
-                                         columns=["source", "summary", "dataset"])
+        if (save_path is not None) and os.path.exists(save_path):
+            # Reuse previously generated summaries if a cached CSV exists.
+            self.summaries_df = pd.read_csv(save_path)
+            print(f'Loaded generated summaries from {save_path}')
+        else:
+            source, summary, dataset = [], [], []
+            print(f"Total: {df.shape[0]}")
+            for index, row in tqdm(df.iterrows(), total=df.shape[0]):
+                _source = row['text']
+                _dataset = row['dataset']
+
+                system_prompt = envs.SYSTEM_PROMPT
+                user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
+
+                while True:
+                    try:
+                        _summary = self.generate_summary(system_prompt, user_prompt)
+                        break
+                    except Exception as e:
+                        if 'Rate limit reached' in str(e):
+                            wait_time = 3660
+                            current_time = datetime.now().strftime('%H:%M:%S')
+                            print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
+                            time.sleep(wait_time)
+                        elif 'is currently loading' in str(e):
+                            wait_time = 200
+                            print(f"Model is loading, waiting {wait_time}s")
+                            time.sleep(wait_time)
+                        else:
+                            print(f"Error at index {index}: {e}")
+                            _summary = ""
+                            exceptions.append(index)
+                            break
+
+                summary.append(_summary)
+                source.append(_source)
+                dataset.append(_dataset)
+
+                # Sleep to prevent hitting rate limits too frequently
+                time.sleep(1)
+
+            self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
+                                             columns=["source", "summary", "dataset"])
+
+            if save_path is not None:
+                print(f'Saving summaries to {save_path}')
+                fpath = Path(save_path)
+                fpath.parent.mkdir(parents=True, exist_ok=True)
+                self.summaries_df.to_csv(fpath)
+
         self.exceptions = exceptions
         self._compute_avg_length()
         self._compute_answer_rate()
 
         return self.summaries_df
+
+    def generate_summary(self, system_prompt: str, user_prompt: str):
+        # Together AI API: used for the mixtral, dbrx and wizardlm models.
+        if 'mixtral' in self.model_id.lower() or 'dbrx' in self.model_id.lower() or 'wizardlm' in self.model_id.lower():
+            suffix = "completions" if ('mixtral' in self.model_id.lower() or 'base' in self.model_id.lower()) else "chat/completions"
+            url = f"https://api.together.xyz/v1/{suffix}"
+
+            payload = {
+                "model": self.model_id,
+                "max_new_tokens": 250,
+                "temperature": 0.0,
+                "repetition_penalty": 1.1 if 'mixtral' in self.model_id.lower() else 1
+            }
+            if 'mixtral' in self.model_id.lower():
+                # Raw completions endpoint: build the prompt directly.
+                payload['prompt'] = 'You must stick to the passage provided. Provide a concise summary of the following passage, covering the core pieces of information described:\nPassage:\n' + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
+            else:
+                payload['messages'] = [{"role": "system", "content": system_prompt},
+                                       {"role": "user", "content": user_prompt}]
+            headers = {
+                "accept": "application/json",
+                "content-type": "application/json",
+                "Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"
+            }
+
+            response = requests.post(url, json=payload, headers=headers)
+            try:
+                result = json.loads(response.text)
+                result = result["choices"][0]
+                if 'message' in result:
+                    result = result["message"]["content"].strip()
+                else:
+                    result = result["text"]
+                result_candidates = [candidate for candidate in result.split('\n\n') if len(candidate) > 0]
+                result = result_candidates[0]
+            except Exception:
+                print(response)
+                result = ''
+            return result
+
+        # OpenAI API (via litellm) for GPT models.
+        elif 'gpt' in self.model_id.lower():
+            response = litellm.completion(
+                model=self.model_id.replace('openai/', ''),
+                messages=[{"role": "system", "content": system_prompt},
+                          {"role": "user", "content": user_prompt}],
+                temperature=0.0,
+                max_tokens=250,
+            )
+            return response['choices'][0]['message']['content']
+
+        # Otherwise, try the HF Inference API first; fall back to a local checkpoint.
+        if self.local_model is None:
+            try:
+                response = litellm.completion(
+                    model='command-r-plus' if 'command' in self.model else self.model,
+                    messages=[{"role": "system", "content": system_prompt},
+                              {"role": "user", "content": user_prompt}],
+                    temperature=0.0,
+                    max_tokens=1024,
+                    api_base=self.api_base,
+                )
+                result = response['choices'][0]['message']['content']
+            except Exception:  # API call failed: download and run the checkpoint locally.
+                self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
+                print("Tokenizer loaded")
+                self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True,
+                                                                        device_map="auto", torch_dtype="auto")
+                print("Local model loaded")
+
+        # Local generation path.
+        if self.local_model:
+            messages = [
+                {"role": "system", "content": system_prompt},  # gemma-1.1 does not accept a system role
+                {"role": "user", "content": user_prompt}
+            ]
+            prompt = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+            input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
+            with torch.no_grad():
+                outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
+            result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            result = result.replace(prompt, '')
+
+        return result
 
     def _compute_avg_length(self):
         """
@@ -203,22 +314,35 @@ class EvaluationModel:
             list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
         """
         hem_scores = []
+        sources = []
+        summaries = []
         source_summary_pairs = util.create_pairs(summaries_df)
 
         for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
             if util.is_summary_valid(summary):
                 try:
-                    score = self.model.predict([doc, summary])[0]
+                    summary = summary.replace('<bos>', '').replace('<eos>', '')
+                    score = self.model.predict([doc, summary])
                     if not isinstance(score, float):
-                        logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
-                        continue
+                        try:
+                            score = score.item()
+                        except Exception:
+                            logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
+                            continue
                     hem_scores.append(score)
+                    sources.append(doc)
+                    summaries.append(summary)
                 except Exception as e:
                     logging.error(f"Error while running HEM: {e}")
                     raise
 
         self.scores = hem_scores
-        return hem_scores
+        eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
+        return hem_scores, eval_results
 
 
     def compute_factual_consistency_rate(self, threshold=0.5):
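The new generate_summary dispatches on substrings of model_id: Together AI for mixtral/dbrx/wizardlm, the OpenAI API (through litellm) for GPT models, and otherwise the HF Inference API with a local transformers checkpoint as a fallback. A condensed sketch of that routing, using a hypothetical pick_backend helper that only mirrors the branch conditions above:

def pick_backend(model_id: str) -> str:
    # Mirrors the branch order in SummaryGenerator.generate_summary (sketch only).
    mid = model_id.lower()
    if 'mixtral' in mid or 'dbrx' in mid or 'wizardlm' in mid:
        return 'together'      # raw completions for mixtral/base models, chat/completions otherwise
    if 'gpt' in mid:
        return 'openai'        # litellm.completion against the OpenAI API
    return 'hf_or_local'       # HF Inference API first, local checkpoint on failure

assert pick_backend("mistralai/mixtral-8x22b") == 'together'
assert pick_backend("openai/GPT-4-Turbo") == 'openai'
assert pick_backend("google/gemma-1.1-7b-it") == 'hf_or_local'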
src/backend/run_eval_suite.py CHANGED
@@ -14,7 +14,8 @@ logging.getLogger("openai").setLevel(logging.WARNING)
 
 
 def run_evaluation(eval_request: EvalRequest, batch_size, device,
-                   local_dir: str, results_repo: str, no_cache=True, limit=None):
+                   local_dir: str, results_repo: str, no_cache=True, limit=None,
+                   need_check=True, write_results=True):
     """
     Run the evaluation for a given model and upload the results.
 
@@ -34,11 +35,20 @@ def run_evaluation(eval_request: EvalRequest, batch_size, device,
     if limit:
         logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
+    output_folder = os.path.join(local_dir, *eval_request.model.split("/"))
+    # NOTE: reloading previously saved results from output_folder is currently disabled.
     try:
         evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
                               batch_size, device, no_cache, limit, write_out=True,
                               output_base_path='logs')
         results = evaluator.evaluate()
+        evaluator.write_results()
     except Exception as e:
         logging.error(f"Error during evaluation: {e}")
         raise
@@ -46,17 +56,20 @@ def run_evaluation(eval_request: EvalRequest, batch_size, device,
     dumped = json.dumps(results, indent=2)
     logging.info(dumped)
 
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"),
-                               f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    output_path = os.path.join(output_folder, f"results_{datetime.now()}.json")
+    os.makedirs(output_folder, exist_ok=True)
     with open(output_path, "w") as f:
         f.write(dumped)
+    print(f"Results have been saved to {output_path}")
 
-    envs.API.upload_file(
-        path_or_fileobj=output_path,
-        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-        repo_id=results_repo,
-        repo_type="dataset",
-    )
+    if not need_check:
+        print("Path in the repo:", f"{eval_request.model}/results_{datetime.now()}.json")
+        envs.API.upload_file(
+            path_or_fileobj=output_path,
+            path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
+            repo_id=results_repo,
+            repo_type="dataset",
+        )
 
     return results
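With the new flags, results are always written locally, but the upload to the results repo only runs when need_check is False. A hedged usage sketch; eval_request, the local directory, and the repo id are placeholders, and write_results is accepted but not referenced in the function body shown above:

# Hedged sketch; eval_request is an EvalRequest taken from the pending queue.
results = run_evaluation(eval_request, batch_size=1, device="cuda",
                         local_dir="./eval-results-bk", results_repo="<org>/results",
                         no_cache=True, limit=None,
                         need_check=False,    # False -> also push results_<timestamp>.json to the Hub
                         write_results=True)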
src/backend/util.py CHANGED
@@ -14,6 +14,7 @@ def is_summary_valid(summary: str) -> bool:
     words = summary.split()
     if len(words) >= 5:
         return True
+    # print(summary)
     return False
 
 
@@ -60,16 +61,16 @@ def format_results(model_name: str, revision: str, precision: str,
         },
         "results": {
             "hallucination_rate": {
-                "hallucination_rate": hallucination_rate
+                "hallucination_rate": round(hallucination_rate, 1)
             },
             "factual_consistency_rate": {
-                "factual_consistency_rate": factual_consistency_rate
+                "factual_consistency_rate": round(factual_consistency_rate, 1)
            },
             "answer_rate": {
-                "answer_rate": answer_rate
+                "answer_rate": round(answer_rate * 100, 1)
             },
             "average_summary_length": {
-                "average_summary_length": avg_summary_len
+                "average_summary_length": round(avg_summary_len, 1)
             },
         }
     }
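The rounding added to format_results keeps one decimal place and reports answer_rate as a percentage. A quick illustration, assuming answer_rate is a fraction in [0, 1] as the *100 factor suggests:

round(0.9876 * 100, 1)   # answer_rate 0.9876 -> 98.8 (reported as a percent)
round(38.2456, 1)        # the other rates and the average summary length -> 38.2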
src/envs.py CHANGED
@@ -1,5 +1,5 @@
 import os
-
+import torch
 from huggingface_hub import HfApi
 
 
@@ -19,7 +19,7 @@ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
-DEVICE = "cpu"
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # was "cpu"
 API = HfApi(token=TOKEN)
 
 DATASET_PATH = "src/datasets/leaderboard_dataset.csv"