Commit 2c24f05 • Minseok Bae committed • 1 parent: b46b972

modified the evaluation pipelines.

Files changed:
- src/backend/model_operations.py (+36 −27)
- src/backend/util.py (+19 −0)
src/backend/model_operations.py CHANGED

```diff
@@ -6,10 +6,9 @@ import logging
 import numpy as np
 import pandas as pd
 import spacy
-# from transformers import AutoModelForCausalLM, AutoTokenizer
 from sentence_transformers import CrossEncoder
-import litellm
 from litellm import completion
+from tqdm import tqdm
 
 import src.backend.util as util
 import src.envs as envs
@@ -23,8 +22,6 @@ nlp = spacy.load("en_core_web_sm")
 
 os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
 
-litellm.set_verbose=True
-
 
 def load_evaluation_model(model_path):
     """Load the evaluation model from the given path
@@ -105,7 +102,7 @@ class SummaryGenerator:
         source, summary, dataset = [], [], []
         exceptions = []
 
-        for index, row in df.iterrows():
+        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
             _source = row['text']
             _dataset = row['dataset']
 
@@ -129,11 +126,12 @@ class SummaryGenerator:
                 exceptions.append(index)
                 break
 
-
-
-
+            summary.append(_summary)
+            source.append(_source)
+            dataset.append(_dataset)
 
-
+            # Sleep to prevent hitting rate limits too frequently
+            time.sleep(1)
 
         self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
                                          columns=["source", "summary", "dataset"])
@@ -147,26 +145,28 @@ class SummaryGenerator:
         """
         Compute the average length of non-empty summaries using SpaCy.
        """
-
-
+        total_word_count = 0
+        total_count = 0
 
        for summary in self.summaries_df['summary']:
-            if summary
+            if util.is_summary_valid(summary):
                doc = nlp(summary)
                words = [token.text for token in doc if token.is_alpha]
-
-
+                total_word_count += len(words)
+                total_count += 1
 
-        self.avg_length = 0 if
+        self.avg_length = 0 if total_count == 0 else total_word_count / total_count
 
     def _compute_answer_rate(self):
         """
         Compute the rate of non-empty summaries.
         """
-
-
+        valid_count = sum(1 for summary in self.summaries_df['summary']
+                          if util.is_summary_valid(summary))
+
+        total_count = len(self.summaries_df)
 
-        self.answer_rate = 0 if
+        self.answer_rate = 0 if total_count == 0 else valid_count / total_count
 
 
 class EvaluationModel:
@@ -193,7 +193,7 @@ class EvaluationModel:
 
     def evaluate_hallucination(self, summaries_df):
         """
-        Evaluate the hallucination rate in summaries.
+        Evaluate the hallucination rate in summaries. Updates the 'scores' attribute
         of the instance with the computed scores.
 
         Args:
@@ -202,14 +202,24 @@
         Returns:
             list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
         """
+        hem_scores = []
         source_summary_pairs = util.create_pairs(summaries_df)
-
-
-
-
-
-
-
+
+        for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
+            if util.is_summary_valid(summary):
+                try:
+                    score = self.model.predict([doc, summary])[0]
+                    if not isinstance(score, float):
+                        logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
+                        continue
+                    hem_scores.append(score)
+                except Exception as e:
+                    logging.error(f"Error while running HEM: {e}")
+                    raise
+
+        self.scores = hem_scores
+        return hem_scores
+
 
     def compute_factual_consistency_rate(self, threshold=0.5):
         """
@@ -240,4 +250,3 @@ class EvaluationModel:
         self.hallucination_rate = 100 - self.factual_consistency_rate
 
         return self.factual_consistency_rate
-
```
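For orientation, the new `evaluate_hallucination` loop drives the model returned by `load_evaluation_model`, a sentence-transformers `CrossEncoder` whose `predict` yields one float score per (source, summary) pair. Below is a minimal standalone sketch of that call pattern; the checkpoint name `vectara/hallucination_evaluation_model` is an assumption for illustration, not something this diff pins down.

```python
# Sketch only: scoring (source, summary) pairs with a CrossEncoder, mirroring
# score = self.model.predict([doc, summary])[0] in evaluate_hallucination.
from sentence_transformers import CrossEncoder

# Assumed checkpoint; the diff only shows that model_path wraps a CrossEncoder.
model = CrossEncoder("vectara/hallucination_evaluation_model")

pairs = [
    ("The capital of France is Paris.", "Paris is the capital of France."),
    ("The capital of France is Paris.", "France moved its capital to Lyon."),
]

# predict() accepts a list of text pairs and returns one score per pair;
# higher scores indicate a summary more consistent with its source.
scores = model.predict(pairs)
for (_, summary), score in zip(pairs, scores):
    print(f"{score:.3f}  {summary!r}")
```

The loop in the diff scores one pair at a time inside a try/except; `predict` also accepts a batch, as shown here.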
src/backend/util.py CHANGED

```diff
@@ -1,3 +1,22 @@
+def is_summary_valid(summary: str) -> bool:
+    """
+    Checks if the summary is valid.
+
+    A summary is valid if it is not empty and contains at least five words.
+
+    Args:
+        summary (str): The summary to check.
+
+    Returns:
+        bool: True if the summary is valid, False otherwise.
+    """
+    if isinstance(summary, str):
+        words = summary.split()
+        if len(words) >= 5:
+            return True
+    return False
+
+
 def create_pairs(df):
     """
     Creates pairs of source and summary from the dataframe.
```