Spaces:
Running
on
T4
Running
on
T4
Change parameters by dataset size
Browse files
app.py
CHANGED
@@ -2,12 +2,10 @@ import requests
|
|
2 |
import logging
|
3 |
import duckdb
|
4 |
import numpy as np
|
5 |
-
|
6 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
7 |
from bertopic import BERTopic
|
8 |
-
from bertopic.representation import
|
9 |
-
KeyBERTInspired,
|
10 |
-
)
|
11 |
from umap import UMAP
|
12 |
from hdbscan import HDBSCAN
|
13 |
from sklearn.feature_extraction.text import CountVectorizer
|
@@ -21,6 +19,11 @@ import os
|
|
21 |
import gradio as gr
|
22 |
|
23 |
|
|
|
|
|
|
|
|
|
|
|
24 |
load_dotenv()
|
25 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
26 |
assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
|
@@ -55,7 +58,7 @@ def get_split_rows(dataset, config, split):
|
|
55 |
None,
|
56 |
)
|
57 |
if split_size is None:
|
58 |
-
raise Exception(f"Error fetching split{split} in config {config}")
|
59 |
return split_size["num_rows"]
|
60 |
|
61 |
|
@@ -83,27 +86,37 @@ def calculate_embeddings(docs):
|
|
83 |
return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
|
84 |
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
# @spaces.GPU
|
87 |
-
def fit_model(docs, embeddings, n_neighbors):
|
88 |
global global_topic_model
|
89 |
|
90 |
umap_model = UMAP(
|
91 |
n_neighbors=n_neighbors,
|
92 |
-
n_components=
|
93 |
min_dist=0.0,
|
94 |
metric="cosine",
|
95 |
random_state=42,
|
96 |
)
|
97 |
|
98 |
hdbscan_model = HDBSCAN(
|
99 |
-
min_cluster_size=
|
|
|
|
|
100 |
metric="euclidean",
|
101 |
cluster_selection_method="eom",
|
102 |
prediction_data=True,
|
103 |
)
|
104 |
|
105 |
new_model = BERTopic(
|
106 |
-
"english",
|
107 |
# Sub-models
|
108 |
embedding_model=sentence_model,
|
109 |
umap_model=umap_model,
|
@@ -113,7 +126,7 @@ def fit_model(docs, embeddings, n_neighbors):
|
|
113 |
# Hyperparameters
|
114 |
top_n_words=10,
|
115 |
verbose=True,
|
116 |
-
min_topic_size=n_neighbors, #
|
117 |
)
|
118 |
logging.info("Fitting new model")
|
119 |
new_model.fit(docs, embeddings)
|
@@ -124,10 +137,6 @@ def fit_model(docs, embeddings, n_neighbors):
|
|
124 |
logging.info("Global model updated")
|
125 |
|
126 |
|
127 |
-
def calculate_n_neighbors(n_rows):
    """Return a neighbor count derived from the number of rows.

    The value is one neighbor per 20 rows (integer division), with a
    floor of 2 so that very small inputs still get a usable value.

    Args:
        n_rows: Number of rows in the dataset split being processed.

    Returns:
        ``max(n_rows // 20, 2)`` as an ``int``.
    """
    # NOTE(review): presumably consumed as UMAP's ``n_neighbors``; the call
    # site is truncated in this view, so confirm against the caller.
    return max(n_rows // 20, 2)
|
129 |
-
|
130 |
-
|
131 |
def generate_topics(dataset, config, split, column, nested_column):
|
132 |
logging.info(
|
133 |
f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
|
@@ -138,11 +147,11 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
138 |
logging.info(f"Split rows: {split_rows}")
|
139 |
|
140 |
limit = min(split_rows, MAX_ROWS)
|
141 |
-
n_neighbors =
|
142 |
|
143 |
reduce_umap_model = UMAP(
|
144 |
n_neighbors=n_neighbors,
|
145 |
-
n_components=2,
|
146 |
min_dist=0.0,
|
147 |
metric="cosine",
|
148 |
random_state=42,
|
@@ -172,7 +181,7 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
172 |
)
|
173 |
|
174 |
embeddings = calculate_embeddings(docs)
|
175 |
-
fit_model(docs, embeddings, n_neighbors)
|
176 |
|
177 |
if base_model is None:
|
178 |
base_model = global_topic_model
|
|
|
2 |
import logging
|
3 |
import duckdb
|
4 |
import numpy as np
|
5 |
+
from torch import cuda
|
6 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
7 |
from bertopic import BERTopic
|
8 |
+
from bertopic.representation import KeyBERTInspired
|
|
|
|
|
9 |
from umap import UMAP
|
10 |
from hdbscan import HDBSCAN
|
11 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
19 |
import gradio as gr
|
20 |
|
21 |
|
22 |
+
"""
|
23 |
+
TODOs:
|
24 |
+
- Test with small datasets (< 1,000 rows)
|
25 |
+
"""
|
26 |
+
|
27 |
load_dotenv()
|
28 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
29 |
assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
|
|
|
58 |
None,
|
59 |
)
|
60 |
if split_size is None:
|
61 |
+
raise Exception(f"Error fetching split {split} in config {config}")
|
62 |
return split_size["num_rows"]
|
63 |
|
64 |
|
|
|
86 |
return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
|
87 |
|
88 |
|
89 |
+
# Adjust n_neighbors and n_components based on dataset size
|
90 |
+
def calculate_n_neighbors_and_components(n_rows):
    """Choose UMAP ``n_neighbors`` and ``n_components`` from the dataset size.

    The returned pair is passed to ``fit_model`` and used to build the UMAP
    model; ``n_neighbors`` is also reused there to derive the HDBSCAN
    ``min_cluster_size`` and the BERTopic ``min_topic_size``.

    Args:
        n_rows: Number of rows (documents) that will be processed.

    Returns:
        A ``(n_neighbors, n_components)`` tuple of ints.
    """
    # Keep n_neighbors proportional to the dataset size (one per 20 rows),
    # bounded to the range [15, 100].
    n_neighbors = min(max(n_rows // 20, 15), 100)
    # Higher components for larger datasets.
    n_components = 10 if n_rows > 1000 else 5

    # Small-dataset guard: a point cannot have more neighbors than there are
    # other points, so never ask for more than n_rows - 1 (minimum of 2).
    n_neighbors = max(2, min(n_neighbors, n_rows - 1))
    # NOTE(review): UMAP's spectral initialisation is expected to require
    # n_components + 1 < n_rows; confirm against the installed umap-learn
    # version. For n_rows >= 7 this clamp leaves the value unchanged.
    n_components = max(2, min(n_components, n_rows - 2))

    return n_neighbors, n_components
|
95 |
+
|
96 |
+
|
97 |
# @spaces.GPU
|
98 |
+
def fit_model(docs, embeddings, n_neighbors, n_components):
|
99 |
global global_topic_model
|
100 |
|
101 |
umap_model = UMAP(
|
102 |
n_neighbors=n_neighbors,
|
103 |
+
n_components=n_components,
|
104 |
min_dist=0.0,
|
105 |
metric="cosine",
|
106 |
random_state=42,
|
107 |
)
|
108 |
|
109 |
hdbscan_model = HDBSCAN(
|
110 |
+
min_cluster_size=max(
|
111 |
+
5, n_neighbors // 2
|
112 |
+
), # Reducing min_cluster_size for fewer outliers
|
113 |
metric="euclidean",
|
114 |
cluster_selection_method="eom",
|
115 |
prediction_data=True,
|
116 |
)
|
117 |
|
118 |
new_model = BERTopic(
|
119 |
+
language="english",
|
120 |
# Sub-models
|
121 |
embedding_model=sentence_model,
|
122 |
umap_model=umap_model,
|
|
|
126 |
# Hyperparameters
|
127 |
top_n_words=10,
|
128 |
verbose=True,
|
129 |
+
min_topic_size=n_neighbors, # Coherent with n_neighbors?
|
130 |
)
|
131 |
logging.info("Fitting new model")
|
132 |
new_model.fit(docs, embeddings)
|
|
|
137 |
logging.info("Global model updated")
|
138 |
|
139 |
|
|
|
|
|
|
|
|
|
140 |
def generate_topics(dataset, config, split, column, nested_column):
|
141 |
logging.info(
|
142 |
f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
|
|
|
147 |
logging.info(f"Split rows: {split_rows}")
|
148 |
|
149 |
limit = min(split_rows, MAX_ROWS)
|
150 |
+
n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
|
151 |
|
152 |
reduce_umap_model = UMAP(
|
153 |
n_neighbors=n_neighbors,
|
154 |
+
n_components=2, # For visualization, keeping it at 2 (2D)
|
155 |
min_dist=0.0,
|
156 |
metric="cosine",
|
157 |
random_state=42,
|
|
|
181 |
)
|
182 |
|
183 |
embeddings = calculate_embeddings(docs)
|
184 |
+
fit_model(docs, embeddings, n_neighbors, n_components)
|
185 |
|
186 |
if base_model is None:
|
187 |
base_model = global_topic_model
|