Spaces: Running on T4
Update app.py
Browse files
app.py
CHANGED
@@ -19,6 +19,7 @@ logging.basicConfig(
|
|
19 |
|
20 |
|
21 |
session = requests.Session()
|
|
|
22 |
|
23 |
|
24 |
def get_parquet_urls(dataset, config, split):
|
@@ -41,7 +42,7 @@ def get_docs_from_parquet(parquet_urls, column, offset, limit):
|
|
41 |
|
42 |
|
43 |
@spaces.GPU
|
44 |
-
def calculate_embeddings(
|
45 |
embeddings = sentence_model.encode(docs, show_progress_bar=True, batch_size=100)
|
46 |
logging.info(f"Embeddings shape: {embeddings.shape}")
|
47 |
return embeddings
|
@@ -91,11 +92,10 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
91 |
# Create instances of GPU-accelerated UMAP and HDBSCAN
|
92 |
# umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
|
93 |
# hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)
|
94 |
-
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
|
95 |
while True:
|
96 |
docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
|
97 |
logging.info(f"------------> New chunk data {offset=} {chunk_size=}")
|
98 |
-
embeddings = calculate_embeddings(
|
99 |
offset = offset + chunk_size
|
100 |
if not docs or offset >= limit:
|
101 |
break
|
|
|
19 |
|
20 |
|
21 |
session = requests.Session()
|
22 |
+
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
|
23 |
|
24 |
|
25 |
def get_parquet_urls(dataset, config, split):
|
|
|
42 |
|
43 |
|
44 |
@spaces.GPU
|
45 |
+
def calculate_embeddings(docs):
|
46 |
embeddings = sentence_model.encode(docs, show_progress_bar=True, batch_size=100)
|
47 |
logging.info(f"Embeddings shape: {embeddings.shape}")
|
48 |
return embeddings
|
|
|
92 |
# Create instances of GPU-accelerated UMAP and HDBSCAN
|
93 |
# umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
|
94 |
# hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)
|
|
|
95 |
while True:
|
96 |
docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
|
97 |
logging.info(f"------------> New chunk data {offset=} {chunk_size=}")
|
98 |
+
embeddings = calculate_embeddings(docs)
|
99 |
offset = offset + chunk_size
|
100 |
if not docs or offset >= limit:
|
101 |
break
|