Minseok Bae committed
Commit 2864204
1 Parent(s): 404587d

Implemented litellm pipeline

requirements.txt CHANGED
@@ -5,6 +5,7 @@ datasets==2.14.5
 gradio==4.4.0
 gradio_client==0.7.0
 huggingface-hub>=0.18.0
+litellm==1.15.1
 matplotlib==3.7.1
 numpy==1.24.2
 pandas==2.0.0
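
For context: `litellm` replaces the local `transformers` loading path below. Its OpenAI-style `completion()` entry point (the call used in `model_operations.py`) routes chat requests to the Hugging Face Inference API. A minimal sketch of that call style; the model id and api_base are placeholders, not part of the commit:

# Minimal sketch of the litellm call style introduced in this commit.
# Model id and api_base are illustrative placeholders.
from litellm import completion

response = completion(
    model="huggingface/mistralai/Mistral-7B-Instruct-v0.1",
    messages=[{"role": "user", "content": "Provide a concise summary of: ..."}],
    temperature=0.0,
    max_tokens=1024,
    api_base="https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1",
)
print(response['choices'][0]['message']['content'])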
src/backend/model_operations.py CHANGED
@@ -1,12 +1,17 @@
+import os
+import time
+from datetime import datetime
 import logging
 
 import numpy as np
 import pandas as pd
 import spacy
-from transformers import AutoModelForCausalLM, AutoTokenizer
+# from transformers import AutoModelForCausalLM, AutoTokenizer
 from sentence_transformers import CrossEncoder
+from litellm import completion
 
 import src.backend.util as util
+import src.envs as envs
 
 # Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO,
@@ -15,6 +20,8 @@ logging.basicConfig(level=logging.INFO,
 # Load spacy model for word tokenization
 nlp = spacy.load("en_core_web_sm")
 
+os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
+
 
 def load_evaluation_model(model_path):
     """Load the evaluation model from the given path
@@ -29,6 +36,18 @@ def load_evaluation_model(model_path):
     return model
 
 
+def generate_summary(model: str, system_prompt: str, user_prompt: str, api_base: str):
+    response = completion(
+        model=model,
+        messages=[{"role": "system", "content": system_prompt},
+                  {"role": "user", "content": user_prompt}],
+        temperature=0.0,
+        max_tokens=1024,
+        api_base=api_base,
+    )
+    return response['choices'][0]['message']['content']
+
+
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.
 
@@ -42,12 +61,13 @@ class ModelLoadingException(Exception):
         self.revision = revision
         super().__init__(f"{messages} id={model_id} revision={revision}")
 
+
 class SummaryGenerator:
     """A class to generate summaries using a causal language model.
 
     Attributes:
-        tokenizer (AutoTokenizer): Tokenizer for the model.
-        model (AutoModelForCausalLM): The causal language model.
+        model (str): huggingface/{model_id}
+        api_base (str): https://api-inference.huggingface.co/models/{model_id}
         summaries_df (DataFrame): DataFrame to store generated summaries.
         revision (str): Model revision.
         avg_length (float): Average length of summaries.
@@ -62,17 +82,13 @@ class SummaryGenerator:
             model_id (str): Identifier for the model.
             revision (str): Revision of the model.
         """
-        try:
-            self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision)
-            self.model = AutoModelForCausalLM.from_pretrained(model_id, revision)
-        except Exception as e:
-            logging.error(f"Error initializing model with id {model_id} and revision {revision}: {e}")
-            raise ModelLoadingException(model_id, revision) from e
+        self.model = f"huggingface/{model_id}"
+        self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
         self.summaries_df = pd.DataFrame()
         self.revision = revision
         self.avg_length = None
         self.answer_rate = None
-        self.error_rate = None
+        self.exceptions = None
 
     def generate_summaries(self, df):
         """Generate summaries for a given DataFrame of source docs.
@@ -84,34 +100,43 @@ class SummaryGenerator:
             summaries_df (DataFrame): Generated summaries by the model.
         """
         source, summary, dataset = [], [], []
+        exceptions = []
 
-        error_count = 0
         for index, row in df.iterrows():
            _source = row['text']
            _dataset = row['dataset']
-
-            prompt = util.generate_prompt(_source)
-            inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024,
-                                    revision=self.revision)
-            try:
-                outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False,
-                                              temperature=0.0, revision=self.revision)
-                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True,
-                                                 revision=self.revision)
-            except Exception as e:
-                print(f"Error at index {index}: {e}")
-                response = ""
-                error_count += 1
-
-            summary.append(response)
-            source.append(_source)
-            dataset.append(_dataset)
+
+            system_prompt = envs.SYSTEM_PROMPT
+            user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
+
+            while True:
+                try:
+                    _summary = generate_summary(self.model, system_prompt,
+                                                user_prompt, self.api_base)
+                    break
+                except Exception as e:
+                    if 'Rate limit reached' in str(e):
+                        wait_time = 3660
+                        current_time = datetime.now().strftime('%H:%M:%S')
+                        print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
+                        time.sleep(wait_time)
+                    else:
+                        print(f"Error at index {index}: {e}")
+                        _summary = ""
+                        exceptions.append(index)
+                        break
+
+            summary.append(_summary)
+            source.append(_source)
+            dataset.append(_dataset)
+
+            time.sleep(1)
 
         self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
                                          columns=["source", "summary", "dataset"])
+        self.exceptions = exceptions
         self._compute_avg_length()
         self._compute_answer_rate()
-        # self._compute_error_rate(error_count)
 
         return self.summaries_df
 
@@ -140,14 +165,6 @@ class SummaryGenerator:
 
         self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
 
-    # def _compute_error_rate(self, count):
-    #     """
-    #     Compute the error rate of summaries.
-    #     """
-    #     total_rows = len(self.summaries_df)
-
-    #     self.error_rate = 0 if total_rows == 0 else count / total_rows
-
 
 class EvaluationModel:
     """A class to evaluate generated summaries.
src/display/about.py CHANGED
@@ -43,10 +43,10 @@ Our evaluation dataset is composed of 1006 documents from multiple public datase
 We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
 
 ## Understand each metric
-### - Hallucination Rate: The percentage of summaries that have a hallucination score below 0.5
-### - Factual Consistency Rate: (1 - Hallucination Rate) * 100 (%)
-### - Answer Rate: The percentage of summaries that are non-empty. (This is a proxy for whether the model generates a summary at all)
-### - Average Summary Length: The average number of words in the generated summaries
+- Hallucination Rate: The percentage of summaries that have a hallucination score below 0.5
+- Factual Consistency Rate: (1 - Hallucination Rate) * 100 (%)
+- Answer Rate: The percentage of summaries that are non-empty. (This is a proxy for whether the model generates a summary at all)
+- Average Summary Length: The average number of words in the generated summaries
 
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
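
The metric definitions above are simple aggregates over per-summary hallucination scores. A small illustrative computation consistent with those definitions (not the leaderboard's actual code; the data is made up):

# Illustrative metric computation; names and data are hypothetical.
scores = [0.2, 0.9, 0.7, 0.1]              # hallucination scores, one per summary
summaries = ["a b c", "", "d e f g", "h"]  # generated summaries

hallucination_rate = sum(s < 0.5 for s in scores) / len(scores)  # fraction scoring below 0.5
factual_consistency_rate = (1 - hallucination_rate) * 100        # in percent
answer_rate = sum(bool(s.strip()) for s in summaries) / len(summaries)
avg_summary_length = sum(len(s.split()) for s in summaries) / len(summaries)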
src/envs.py CHANGED
@@ -24,3 +24,6 @@ API = HfApi(token=TOKEN)
 SOURCE_PATH = "src/datasets/leaderboard_dataset.csv"
 SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
 HEM_PATH = 'vectara/hallucination_evaluation_model'
+
+SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
+USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
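
For reference, `generate_summaries` in `model_operations.py` combines these two constants with each source passage before calling litellm; the passage text here is illustrative:

# Prompt assembly as done in generate_summaries (passage text is illustrative).
import src.envs as envs

_source = "Some passage to summarize..."
system_prompt = envs.SYSTEM_PROMPT
user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"

messages = [{"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}]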