asoria HF staff committed on
Commit
a5c2f0e
1 Parent(s): abbebb7

Adding progress bar

Browse files
Files changed (1) hide show
  1. app.py +34 -7
app.py CHANGED
@@ -38,6 +38,9 @@ logging.basicConfig(
38
  level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
39
  )
40
 
 
 
 
41
 
42
  session = requests.Session()
43
  sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -113,6 +116,22 @@ reduce_umap_model = UMAP(
113
  global_topic_model = None
114
 
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  def get_parquet_urls(dataset, config, split):
117
  parquet_files = session.get(
118
  f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
@@ -170,9 +189,13 @@ def generate_topics(dataset, config, split, column, nested_column):
170
  )
171
 
172
  parquet_urls = get_parquet_urls(dataset, config, split)
173
- limit = 1_000
174
- chunk_size = 300
 
 
175
  offset = 0
 
 
176
  base_model = None
177
  all_docs = []
178
  reduced_embeddings_list = []
@@ -180,15 +203,17 @@ def generate_topics(dataset, config, split, column, nested_column):
180
  yield (
181
  gr.DataFrame(interactive=False, visible=True),
182
  gr.Plot(visible=True),
183
- gr.Label({f"⚙️ Generating topics {dataset}": 0.0}, visible=True),
 
 
184
  )
185
  while offset < limit:
186
- docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
187
  if not docs:
188
  break
189
 
190
  logging.info(
191
- f"----> Processing chunk: {offset=} {chunk_size=} with {len(docs)} docs"
192
  )
193
 
194
  embeddings = calculate_embeddings(docs)
@@ -225,15 +250,17 @@ def generate_topics(dataset, config, split, column, nested_column):
225
  )
226
 
227
  logging.info(f"Topics: {repr_model_topics}")
228
- progress = min(offset / limit, 1.0)
229
 
 
 
 
230
  yield (
231
  topics_info,
232
  topic_plot,
233
  gr.Label({f"⚙️ Generating topics {dataset}": progress}, visible=True),
234
  )
235
 
236
- offset += chunk_size
237
 
238
  logging.info("Finished processing all data")
239
  cuda.empty_cache() # Clear cache at the end of each chunk
 
38
  level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
39
  )
40
 
41
+ MAX_ROWS = 1_000
42
+ CHUNK_SIZE = 300
43
+
44
 
45
  session = requests.Session()
46
  sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 
116
  global_topic_model = None
117
 
118
 
119
def get_split_rows(dataset, config, split):
    """Return the number of rows reported for one split of a dataset config.

    Requests the datasets-server ``/size`` endpoint for ``dataset`` and
    ``config`` and returns the ``"num_rows"`` value of the entry in
    ``size.splits`` whose ``"split"`` field equals ``split``.

    Raises:
        Exception: if the JSON response contains an ``"error"`` key, or if
            no entry matching ``split`` is present in the response.
    """
    config_size = session.get(
        f"https://datasets-server.huggingface.co/size?dataset={dataset}&config={config}",
        timeout=20,
    ).json()
    if "error" in config_size:
        raise Exception(f"Error fetching config size: {config_size['error']}")
    # The endpoint is queried per config, so the requested split has to be
    # picked out of the returned list of splits.
    split_size = next(
        (s for s in config_size["size"]["splits"] if s["split"] == split),
        None,
    )
    if split_size is None:
        raise Exception(f"Error fetching split {split} in config {config}")
    return split_size["num_rows"]
133
+
134
+
135
  def get_parquet_urls(dataset, config, split):
136
  parquet_files = session.get(
137
  f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
 
189
  )
190
 
191
  parquet_urls = get_parquet_urls(dataset, config, split)
192
+ split_rows = get_split_rows(dataset, config, split)
193
+ logging.info(f"Split rows: {split_rows}")
194
+
195
+ limit = min(split_rows, MAX_ROWS)
196
  offset = 0
197
+ rows_processed = 0
198
+
199
  base_model = None
200
  all_docs = []
201
  reduced_embeddings_list = []
 
203
  yield (
204
  gr.DataFrame(interactive=False, visible=True),
205
  gr.Plot(visible=True),
206
+ gr.Label(
207
+ {f"⚙️ Generating topics {dataset}": rows_processed / limit}, visible=True
208
+ ),
209
  )
210
  while offset < limit:
211
+ docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
212
  if not docs:
213
  break
214
 
215
  logging.info(
216
+ f"----> Processing chunk: {offset=} {CHUNK_SIZE=} with {len(docs)} docs"
217
  )
218
 
219
  embeddings = calculate_embeddings(docs)
 
250
  )
251
 
252
  logging.info(f"Topics: {repr_model_topics}")
 
253
 
254
+ rows_processed += len(docs)
255
+ progress = min(rows_processed / limit, 1.0)
256
+ logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
257
  yield (
258
  topics_info,
259
  topic_plot,
260
  gr.Label({f"⚙️ Generating topics {dataset}": progress}, visible=True),
261
  )
262
 
263
+ offset += CHUNK_SIZE
264
 
265
  logging.info("Finished processing all data")
266
  cuda.empty_cache() # Clear cache at the end of each chunk