Spaces:

datasets-topics
/

topics-generator

Running on T4

App Files Files Community

asoria HF staff committed on Oct 3

Commit

560300f

•

1 Parent(s): 4996a19

Refactor

Browse files

Files changed (2) hide show

app.py +22 -20
prompts.py +5 -3

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ from bertopic import BERTopic
 import gradio as gr
 from bertopic.representation import (
     KeyBERTInspired,
-    MaximalMarginalRelevance,
     TextGeneration,
 )
 from umap import UMAP
@@ -19,8 +18,7 @@ from transformers import (
     AutoModelForCausalLM,
     pipeline,
 )
-from prompts import system_prompt, example_prompt, main_prompt
-from umap import UMAP
 from hdbscan import HDBSCAN
 from sklearn.feature_extraction.text import CountVectorizer
@@ -36,7 +34,6 @@ logging.basicConfig(
 session = requests.Session()
 sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 keybert = KeyBERTInspired()
-mmr = MaximalMarginalRelevance(diversity=0.3)
 vectorizer_model = CountVectorizer(stop_words="english")
 model_id = "meta-llama/Llama-2-7b-chat-hf"
@@ -52,7 +49,6 @@ bnb_config = BitsAndBytesConfig(
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-# Llama 2 Model
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     trust_remote_code=True,
@@ -68,13 +64,11 @@ generator = pipeline(
     max_new_tokens=500,
     repetition_penalty=1.1,
 )
-prompt = system_prompt + example_prompt + main_prompt
-llama2 = TextGeneration(generator, prompt=prompt)
 representation_model = {
     "KeyBERT": keybert,
     "Llama2": llama2,
-    # "MMR": mmr,
 }
 umap_model = UMAP(
@@ -132,9 +126,9 @@ def fit_model(base_model, docs, embeddings):
         verbose=True,
         min_topic_size=15,
     )
-    logging.info("Fitting new model")
     new_model.fit(docs, embeddings)
-    logging.info("End fitting new model")
     if base_model is None:
         return new_model, new_model
@@ -157,35 +151,43 @@ def generate_topics(dataset, config, split, column, nested_column):
     offset = 0
     base_model = None
     all_docs = []
-    all_reduced_embeddings = np.empty((0, 2))
-    while True:
         docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
         logging.info(
-            f"------------> New chunk data {offset=} {chunk_size=} with {len(docs)} docs"
         )
         embeddings = calculate_embeddings(docs)
-        offset = offset + chunk_size
-        if not docs or offset >= limit:
-            break
         base_model, _ = fit_model(base_model, docs, embeddings)
         llama2_labels = [
             label[0][0].split("\n")[0]
             for label in base_model.get_topics(full=True)["Llama2"].values()
         ]
-        logging.info(f"Topics: {llama2_labels}")
         base_model.set_topic_labels(llama2_labels)
         reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
         all_docs.extend(docs)
-        all_reduced_embeddings = np.vstack((all_reduced_embeddings, reduced_embeddings))
         topics_info = base_model.get_topic_info()
         topic_plot = base_model.visualize_documents(
-            all_docs, reduced_embeddings=all_reduced_embeddings, custom_labels=True
         )
-        logging.info(f"Topics for merged model: {base_model.topic_labels_}")
         yield topics_info, topic_plot
     logging.info("Finished processing all data")
     return base_model.get_topic_info(), base_model.visualize_topics()

 import gradio as gr
 from bertopic.representation import (
     KeyBERTInspired,
     TextGeneration,
 )
 from umap import UMAP
     AutoModelForCausalLM,
     pipeline,
 )
+from prompts import REPRESENTATION_PROMPT
 from hdbscan import HDBSCAN
 from sklearn.feature_extraction.text import CountVectorizer
 session = requests.Session()
 sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 keybert = KeyBERTInspired()
 vectorizer_model = CountVectorizer(stop_words="english")
 model_id = "meta-llama/Llama-2-7b-chat-hf"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     trust_remote_code=True,
     max_new_tokens=500,
     repetition_penalty=1.1,
 )
+llama2 = TextGeneration(generator, prompt=REPRESENTATION_PROMPT)
 representation_model = {
     "KeyBERT": keybert,
     "Llama2": llama2,
 }
 umap_model = UMAP(
         verbose=True,
         min_topic_size=15,
     )
+    logging.debug("Fitting new model")
     new_model.fit(docs, embeddings)
+    logging.debug("End fitting new model")
     if base_model is None:
         return new_model, new_model
     offset = 0
     base_model = None
     all_docs = []
+    reduced_embeddings_list = []
+    while offset < limit:
         docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
+        if not docs:
+            break
         logging.info(
+            f"----> Processing chunk: {offset=} {chunk_size=} with {len(docs)} docs"
         )
         embeddings = calculate_embeddings(docs)
         base_model, _ = fit_model(base_model, docs, embeddings)
         llama2_labels = [
             label[0][0].split("\n")[0]
             for label in base_model.get_topics(full=True)["Llama2"].values()
         ]
         base_model.set_topic_labels(llama2_labels)
         reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
+        reduced_embeddings_list.append(reduced_embeddings)
         all_docs.extend(docs)
         topics_info = base_model.get_topic_info()
         topic_plot = base_model.visualize_documents(
+            all_docs,
+            reduced_embeddings=np.vstack(reduced_embeddings_list),
+            custom_labels=True,
         )
+        logging.info(f"Topics: {llama2_labels}")
         yield topics_info, topic_plot
+        offset += chunk_size
     logging.info("Finished processing all data")
     return base_model.get_topic_info(), base_model.visualize_topics()

prompts.py CHANGED Viewed

@@ -1,10 +1,10 @@
-system_prompt = """
 <s>[INST] <<SYS>>
 You are a helpful, respectful and honest assistant for labeling topics.
 <</SYS>>
 """
-example_prompt = """
 I have a topic that contains the following documents:
 - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
 - Meat, but especially beef, is the word food in terms of emissions.
@@ -17,7 +17,7 @@ Based on the information about the topic above, please create a short label of t
 [/INST] Environmental impacts of eating meat
 """
-main_prompt = """
 [INST]
 I have a topic that contains the following documents:
 [DOCUMENTS]
@@ -27,3 +27,5 @@ The topic is described by the following keywords: '[KEYWORDS]'.
 Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
 [/INST]
 """

+SYSTEM_PROMPT = """
 <s>[INST] <<SYS>>
 You are a helpful, respectful and honest assistant for labeling topics.
 <</SYS>>
 """
+EXAMPLE_PROMPT = """
 I have a topic that contains the following documents:
 - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
 - Meat, but especially beef, is the word food in terms of emissions.
 [/INST] Environmental impacts of eating meat
 """
+MAIN_PROMPT = """
 [INST]
 I have a topic that contains the following documents:
 [DOCUMENTS]
 Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
 [/INST]
 """
+REPRESENTATION_PROMPT = SYSTEM_PROMPT + EXAMPLE_PROMPT + MAIN_PROMPT