asoria HF staff committed on
Commit
b5bf2c0
1 Parent(s): 7dcda45

Change parameters by dataset size

Browse files
Files changed (1) hide show
  1. app.py +26 -17
app.py CHANGED
@@ -2,12 +2,10 @@ import requests
2
  import logging
3
  import duckdb
4
  import numpy as np
5
-
6
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
7
  from bertopic import BERTopic
8
- from bertopic.representation import (
9
- KeyBERTInspired,
10
- )
11
  from umap import UMAP
12
  from hdbscan import HDBSCAN
13
  from sklearn.feature_extraction.text import CountVectorizer
@@ -21,6 +19,11 @@ import os
21
  import gradio as gr
22
 
23
 
 
 
 
 
 
24
  load_dotenv()
25
  HF_TOKEN = os.getenv("HF_TOKEN")
26
  assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
@@ -55,7 +58,7 @@ def get_split_rows(dataset, config, split):
55
  None,
56
  )
57
  if split_size is None:
58
- raise Exception(f"Error fetching split{split} in config {config}")
59
  return split_size["num_rows"]
60
 
61
 
@@ -83,27 +86,37 @@ def calculate_embeddings(docs):
83
  return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
84
 
85
 
 
 
 
 
 
 
 
 
86
  # @spaces.GPU
87
- def fit_model(docs, embeddings, n_neighbors):
88
  global global_topic_model
89
 
90
  umap_model = UMAP(
91
  n_neighbors=n_neighbors,
92
- n_components=5,
93
  min_dist=0.0,
94
  metric="cosine",
95
  random_state=42,
96
  )
97
 
98
  hdbscan_model = HDBSCAN(
99
- min_cluster_size=n_neighbors,
 
 
100
  metric="euclidean",
101
  cluster_selection_method="eom",
102
  prediction_data=True,
103
  )
104
 
105
  new_model = BERTopic(
106
- "english",
107
  # Sub-models
108
  embedding_model=sentence_model,
109
  umap_model=umap_model,
@@ -113,7 +126,7 @@ def fit_model(docs, embeddings, n_neighbors):
113
  # Hyperparameters
114
  top_n_words=10,
115
  verbose=True,
116
- min_topic_size=n_neighbors, # TODO: Should this value be coherent with N_NEIGHBORS?
117
  )
118
  logging.info("Fitting new model")
119
  new_model.fit(docs, embeddings)
@@ -124,10 +137,6 @@ def fit_model(docs, embeddings, n_neighbors):
124
  logging.info("Global model updated")
125
 
126
 
127
- def calculate_n_neighbors(n_rows):
128
- return max(n_rows // 20, 2)
129
-
130
-
131
  def generate_topics(dataset, config, split, column, nested_column):
132
  logging.info(
133
  f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
@@ -138,11 +147,11 @@ def generate_topics(dataset, config, split, column, nested_column):
138
  logging.info(f"Split rows: {split_rows}")
139
 
140
  limit = min(split_rows, MAX_ROWS)
141
- n_neighbors = calculate_n_neighbors(limit)
142
 
143
  reduce_umap_model = UMAP(
144
  n_neighbors=n_neighbors,
145
- n_components=2,
146
  min_dist=0.0,
147
  metric="cosine",
148
  random_state=42,
@@ -172,7 +181,7 @@ def generate_topics(dataset, config, split, column, nested_column):
172
  )
173
 
174
  embeddings = calculate_embeddings(docs)
175
- fit_model(docs, embeddings, n_neighbors)
176
 
177
  if base_model is None:
178
  base_model = global_topic_model
 
2
  import logging
3
  import duckdb
4
  import numpy as np
5
+ from torch import cuda
6
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
7
  from bertopic import BERTopic
8
+ from bertopic.representation import KeyBERTInspired
 
 
9
  from umap import UMAP
10
  from hdbscan import HDBSCAN
11
  from sklearn.feature_extraction.text import CountVectorizer
 
19
  import gradio as gr
20
 
21
 
22
+ """
23
+ TODOs:
24
+ - Try for small dataset <1000 rows
25
+ """
26
+
27
  load_dotenv()
28
  HF_TOKEN = os.getenv("HF_TOKEN")
29
  assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
 
58
  None,
59
  )
60
  if split_size is None:
61
+ raise Exception(f"Error fetching split {split} in config {config}")
62
  return split_size["num_rows"]
63
 
64
 
 
86
  return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
87
 
88
 
89
+ # Adjust n_neighbors and n_components based on dataset size
90
+ def calculate_n_neighbors_and_components(n_rows):
91
+ # Ensure n_neighbors is proportional to the dataset size, with reasonable limits
92
+ n_neighbors = min(max(n_rows // 20, 15), 100)
93
+ n_components = 10 if n_rows > 1000 else 5 # Higher components for larger datasets
94
+ return n_neighbors, n_components
95
+
96
+
97
  # @spaces.GPU
98
+ def fit_model(docs, embeddings, n_neighbors, n_components):
99
  global global_topic_model
100
 
101
  umap_model = UMAP(
102
  n_neighbors=n_neighbors,
103
+ n_components=n_components,
104
  min_dist=0.0,
105
  metric="cosine",
106
  random_state=42,
107
  )
108
 
109
  hdbscan_model = HDBSCAN(
110
+ min_cluster_size=max(
111
+ 5, n_neighbors // 2
112
+ ), # Reducing min_cluster_size for fewer outliers
113
  metric="euclidean",
114
  cluster_selection_method="eom",
115
  prediction_data=True,
116
  )
117
 
118
  new_model = BERTopic(
119
+ language="english",
120
  # Sub-models
121
  embedding_model=sentence_model,
122
  umap_model=umap_model,
 
126
  # Hyperparameters
127
  top_n_words=10,
128
  verbose=True,
129
+ min_topic_size=n_neighbors, # Coherent with n_neighbors?
130
  )
131
  logging.info("Fitting new model")
132
  new_model.fit(docs, embeddings)
 
137
  logging.info("Global model updated")
138
 
139
 
 
 
 
 
140
  def generate_topics(dataset, config, split, column, nested_column):
141
  logging.info(
142
  f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
 
147
  logging.info(f"Split rows: {split_rows}")
148
 
149
  limit = min(split_rows, MAX_ROWS)
150
+ n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
151
 
152
  reduce_umap_model = UMAP(
153
  n_neighbors=n_neighbors,
154
+ n_components=2, # For visualization, keeping it at 2 (2D)
155
  min_dist=0.0,
156
  metric="cosine",
157
  random_state=42,
 
181
  )
182
 
183
  embeddings = calculate_embeddings(docs)
184
+ fit_model(docs, embeddings, n_neighbors, n_components)
185
 
186
  if base_model is None:
187
  base_model = global_topic_model