Spaces:
Running
on
T4
Running
on
T4
Adding progress bar
Browse files
app.py
CHANGED
@@ -38,6 +38,9 @@ logging.basicConfig(
|
|
38 |
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
39 |
)
|
40 |
|
|
|
|
|
|
|
41 |
|
42 |
session = requests.Session()
|
43 |
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
|
@@ -113,6 +116,22 @@ reduce_umap_model = UMAP(
|
|
113 |
global_topic_model = None
|
114 |
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
def get_parquet_urls(dataset, config, split):
|
117 |
parquet_files = session.get(
|
118 |
f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
|
@@ -170,9 +189,13 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
170 |
)
|
171 |
|
172 |
parquet_urls = get_parquet_urls(dataset, config, split)
|
173 |
-
|
174 |
-
|
|
|
|
|
175 |
offset = 0
|
|
|
|
|
176 |
base_model = None
|
177 |
all_docs = []
|
178 |
reduced_embeddings_list = []
|
@@ -180,15 +203,17 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
180 |
yield (
|
181 |
gr.DataFrame(interactive=False, visible=True),
|
182 |
gr.Plot(visible=True),
|
183 |
-
gr.Label(
|
|
|
|
|
184 |
)
|
185 |
while offset < limit:
|
186 |
-
docs = get_docs_from_parquet(parquet_urls, column, offset,
|
187 |
if not docs:
|
188 |
break
|
189 |
|
190 |
logging.info(
|
191 |
-
f"----> Processing chunk: {offset=} {
|
192 |
)
|
193 |
|
194 |
embeddings = calculate_embeddings(docs)
|
@@ -225,15 +250,17 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
225 |
)
|
226 |
|
227 |
logging.info(f"Topics: {repr_model_topics}")
|
228 |
-
progress = min(offset / limit, 1.0)
|
229 |
|
|
|
|
|
|
|
230 |
yield (
|
231 |
topics_info,
|
232 |
topic_plot,
|
233 |
gr.Label({f"⚙️ Generating topics {dataset}": progress}, visible=True),
|
234 |
)
|
235 |
|
236 |
-
offset +=
|
237 |
|
238 |
logging.info("Finished processing all data")
|
239 |
cuda.empty_cache() # Clear cache at the end of each chunk
|
|
|
38 |
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
39 |
)
|
40 |
|
41 |
+
MAX_ROWS = 1_000
|
42 |
+
CHUNK_SIZE = 300
|
43 |
+
|
44 |
|
45 |
session = requests.Session()
|
46 |
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
|
116 |
global_topic_model = None
|
117 |
|
118 |
|
119 |
+
def get_split_rows(dataset, config, split):
    """Return the number of rows in one split of a dataset.

    Queries the datasets-server ``/size`` endpoint for the given dataset and
    config, then looks up the entry whose ``"split"`` field equals ``split``.

    Args:
        dataset: Dataset identifier, sent as the ``dataset`` query parameter.
        config: Configuration name, sent as the ``config`` query parameter.
        split: Name of the split to look up in the response.

    Returns:
        The ``"num_rows"`` value of the matching split entry.

    Raises:
        Exception: If the response contains an ``"error"`` key, or if no
            entry for ``split`` is found in the response.
    """
    # Pass the values through ``params`` so they are URL-encoded; names
    # containing characters such as ``&``, ``#`` or spaces would otherwise
    # corrupt a hand-built query string.
    config_size = session.get(
        "https://datasets-server.huggingface.co/size",
        params={"dataset": dataset, "config": config},
        timeout=20,
    ).json()
    if "error" in config_size:
        raise Exception(f"Error fetching config size: {config_size['error']}")

    # The response lists every split of the config; pick the requested one.
    split_size = next(
        (s for s in config_size["size"]["splits"] if s["split"] == split),
        None,
    )
    if split_size is None:
        raise Exception(f"Error fetching split {split} in config {config}")
    return split_size["num_rows"]
|
133 |
+
|
134 |
+
|
135 |
def get_parquet_urls(dataset, config, split):
|
136 |
parquet_files = session.get(
|
137 |
f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
|
|
|
189 |
)
|
190 |
|
191 |
parquet_urls = get_parquet_urls(dataset, config, split)
|
192 |
+
split_rows = get_split_rows(dataset, config, split)
|
193 |
+
logging.info(f"Split rows: {split_rows}")
|
194 |
+
|
195 |
+
limit = min(split_rows, MAX_ROWS)
|
196 |
offset = 0
|
197 |
+
rows_processed = 0
|
198 |
+
|
199 |
base_model = None
|
200 |
all_docs = []
|
201 |
reduced_embeddings_list = []
|
|
|
203 |
yield (
|
204 |
gr.DataFrame(interactive=False, visible=True),
|
205 |
gr.Plot(visible=True),
|
206 |
+
gr.Label(
|
207 |
+
{f"⚙️ Generating topics {dataset}": rows_processed / limit}, visible=True
|
208 |
+
),
|
209 |
)
|
210 |
while offset < limit:
|
211 |
+
docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
|
212 |
if not docs:
|
213 |
break
|
214 |
|
215 |
logging.info(
|
216 |
+
f"----> Processing chunk: {offset=} {CHUNK_SIZE=} with {len(docs)} docs"
|
217 |
)
|
218 |
|
219 |
embeddings = calculate_embeddings(docs)
|
|
|
250 |
)
|
251 |
|
252 |
logging.info(f"Topics: {repr_model_topics}")
|
|
|
253 |
|
254 |
+
rows_processed += len(docs)
|
255 |
+
progress = min(rows_processed / limit, 1.0)
|
256 |
+
logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
|
257 |
yield (
|
258 |
topics_info,
|
259 |
topic_plot,
|
260 |
gr.Label({f"⚙️ Generating topics {dataset}": progress}, visible=True),
|
261 |
)
|
262 |
|
263 |
+
offset += CHUNK_SIZE
|
264 |
|
265 |
logging.info("Finished processing all data")
|
266 |
cuda.empty_cache() # Clear cache at the end of each chunk
|