asoria committed
Commit e2d9a99
1 Parent(s): 10cefed

Adding viz for merged model

Files changed (2):
  1. app.py +24 -47
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,4 +1,4 @@
-import spaces
+# import spaces
 import requests
 import logging
 import duckdb
@@ -8,6 +8,7 @@ import pandas as pd
 import gradio as gr
 from bertopic.representation import KeyBERTInspired
 from umap import UMAP
+import numpy as np
 
 # from cuml.cluster import HDBSCAN
 # from cuml.manifold import UMAP
@@ -41,14 +42,14 @@ def get_docs_from_parquet(parquet_urls, column, offset, limit):
     return df[column].tolist()
 
 
-@spaces.GPU
+# @spaces.GPU
 def calculate_embeddings(docs):
     embeddings = sentence_model.encode(docs, show_progress_bar=True, batch_size=100)
     logging.info(f"Embeddings shape: {embeddings.shape}")
     return embeddings
 
 
-@spaces.GPU
+# @spaces.GPU
 def fit_model(base_model, sentence_model, representation_model, docs, embeddings):
     new_model = BERTopic(
         "english",
@@ -81,59 +82,35 @@ def generate_topics(dataset, config, split, column, nested_column):
     offset = 0
     representation_model = KeyBERTInspired()
     base_model = None
-    # docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
-
-    # base_model = BERTopic(
-    #     "english", representation_model=representation_model, min_topic_size=15
-    # )
-    # base_model.fit_transform(docs)
-
-    # yield base_model.get_topic_info(), base_model.visualize_topics()
-    # Create instances of GPU-accelerated UMAP and HDBSCAN
-    # umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
-    # hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)
+    all_docs = []
+    all_reduced_embeddings = np.empty((0, 2))
     while True:
         docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
-        logging.info(f"------------> New chunk data {offset=} {chunk_size=}")
+        logging.info(
+            f"------------> New chunk data {offset=} {chunk_size=} with {len(docs)} docs"
+        )
         embeddings = calculate_embeddings(docs)
         offset = offset + chunk_size
         if not docs or offset >= limit:
             break
-
-        # new_model = BERTopic(
-        #     "english",
-        #     embedding_model=sentence_model,
-        #     representation_model=representation_model,
-        #     min_topic_size=15,  # umap_model=umap_model, hdbscan_model=hdbscan_model
-        # )
-        # logging.info("Fitting new model")
-        # new_model.fit(docs, embeddings)
-        # logging.info("End fitting new model")
-        # if base_model is not None:
-        #     updated_model = BERTopic.merge_models([base_model, new_model])
-        #     nr_new_topics = len(set(updated_model.topics_)) - len(
-        #         set(base_model.topics_)
-        #     )
-        #     new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
-        #     logging.info("The following topics are newly found:")
-        #     logging.info(f"{new_topics}\n")
-        #     base_model = updated_model
-        # else:
-        #     base_model = new_model
-        # logging.info(base_model.get_topic_info())
-        base_model, new_model = fit_model(
+        base_model, _ = fit_model(
             base_model, sentence_model, representation_model, docs, embeddings
         )
-        # reduced_embeddings = UMAP(
-        #     n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
-        # ).fit_transform(embeddings)
-        # logging.info(f"Reduced embeddings shape: {reduced_embeddings.shape}")
-        yield (
-            base_model.get_topic_info(),
-            new_model.visualize_documents(
-                docs, embeddings=embeddings
-            ),  # TODO: Visualize the merged models
+        reduced_embeddings = UMAP(
+            n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
+        ).fit_transform(embeddings)
+        logging.info(f"Reduced embeddings shape: {reduced_embeddings.shape}")
+
+        all_docs.extend(docs)
+        all_reduced_embeddings = np.vstack((all_reduced_embeddings, reduced_embeddings))
+        logging.info(f"Stacked embeddings shape: {all_reduced_embeddings.shape}")
+        topics_info = base_model.get_topic_info()
+        topic_plot = base_model.visualize_documents(
+            all_docs, reduced_embeddings=all_reduced_embeddings
         )
+
+        yield topics_info, topic_plot
+
     logging.info("Finished processing all data")
     return base_model.get_topic_info(), base_model.visualize_topics()
 
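Note: the diff shows only the head of fit_model; its full body is outside the changed hunks. As a point of reference, here is a minimal sketch of the chunk-and-merge pattern that the removed comments describe, assuming BERTopic >= 0.16 (which provides BERTopic.merge_models). It is a reconstruction for illustration, not the file's exact code:

from bertopic import BERTopic

def fit_model(base_model, sentence_model, representation_model, docs, embeddings):
    # Fit a fresh model on the current chunk, reusing the precomputed embeddings.
    new_model = BERTopic(
        "english",
        embedding_model=sentence_model,
        representation_model=representation_model,
        min_topic_size=15,
    )
    new_model.fit(docs, embeddings)
    if base_model is None:
        return new_model, new_model
    # Merging appends topics the base model has not yet seen as new topics.
    updated_model = BERTopic.merge_models([base_model, new_model])
    return updated_model, new_model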
requirements.txt CHANGED
@@ -7,4 +7,5 @@ sentence-transformers
 datamapplot
 bertopic
 pandas
-torch
+torch
+numpy
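For the visualization itself, the new loop precomputes 2-D coordinates per chunk with UMAP and stacks them, so each yielded plot covers every chunk seen so far without re-reducing old embeddings. A minimal sketch under that assumption, where the chunks iterable is a hypothetical stand-in for the parquet-chunk loop above:

import numpy as np
from umap import UMAP

all_docs, all_reduced = [], np.empty((0, 2))
for docs, embeddings in chunks:  # hypothetical (docs, embeddings) pairs per chunk
    reduced = UMAP(
        n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
    ).fit_transform(embeddings)
    all_docs.extend(docs)
    all_reduced = np.vstack((all_reduced, reduced))

# Supplying reduced_embeddings lets BERTopic skip its own reduction step
# and plot the precomputed coordinates directly.
fig = base_model.visualize_documents(all_docs, reduced_embeddings=all_reduced)

One caveat of this design: each chunk gets its own UMAP fit, so coordinates from different chunks do not share a common projection; it trades global layout consistency for streaming speed.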