Spaces:

datasets-topics
/

topics-generator

Sleeping

App Files Community

asoria (HF staff) committed 18 days ago

Commit

b5ecaeb

•

1 Parent(s): 8712d35

Apply text generation layer at the end only

Browse files

Files changed (1) hide show

app.py +76 -40

app.py CHANGED Viewed

@@ -44,7 +44,6 @@ DATASETS_TOPICS_ORGANIZATION = os.getenv(
     "DATASETS_TOPICS_ORGANIZATION", "datasets-topics"
 )
 USE_CUML = int(os.getenv("USE_CUML", "1"))
-USE_LLM_TEXT_GENERATION = int(os.getenv("USE_LLM_TEXT_GENERATION", "1"))
 # Use cuml lib only if configured
 if USE_CUML:
@@ -60,43 +59,39 @@ logging.basicConfig(
 )
 api = HfApi(token=HF_TOKEN)
-sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
-# Representation model
-if USE_LLM_TEXT_GENERATION:
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_compute_dtype=bfloat16,
-    )
-    model_id = "meta-llama/Llama-2-7b-chat-hf"
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        trust_remote_code=True,
-        quantization_config=bnb_config,
-        device_map="auto",
-    )
-    model.eval()
-    generator = pipeline(
-        model=model,
-        tokenizer=tokenizer,
-        task="text-generation",
-        temperature=0.1,
-        max_new_tokens=500,
-        repetition_penalty=1.1,
-    )
-    representation_model = TextGeneration(generator, prompt=REPRESENTATION_PROMPT)
-else:
-    representation_model = KeyBERTInspired()
 vectorizer_model = CountVectorizer(stop_words="english")
 def calculate_embeddings(docs):
-    return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
 def calculate_n_neighbors_and_components(n_rows):
@@ -126,7 +121,7 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
     new_model = BERTopic(
         language="english",
         # Sub-models
-        embedding_model=sentence_model,  # Step 1 - Extract embeddings
         umap_model=umap_model,  # Step 2 - UMAP model
         hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
         vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
@@ -294,13 +289,55 @@ def generate_topics(dataset, config, split, column, plot_type):
     all_topics = base_model.topics_
     topic_info = base_model.get_topic_info()
-    topic_names = {row["Topic"]: row["Name"] for _, row in topic_info.iterrows()}
-    topic_names_array = np.array(
-        [
-            topic_names.get(topic, "No Topic").split("_")[1].strip("-")
-            for topic in all_topics
-        ]
     )
     interactive_plot = datamapplot.create_interactive_plot(
         reduced_embeddings_array,
         topic_names_array,
@@ -348,7 +385,6 @@ def generate_topics(dataset, config, split, column, plot_type):
         base_model,
         all_topics,
         topic_info,
-        topic_names,
         topic_names_array,
         interactive_plot,
     )

     "DATASETS_TOPICS_ORGANIZATION", "datasets-topics"
 )
 USE_CUML = int(os.getenv("USE_CUML", "1"))
 # Use cuml lib only if configured
 if USE_CUML:
 )
 api = HfApi(token=HF_TOKEN)
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=bfloat16,
+)
+model_id = "meta-llama/Llama-2-7b-chat-hf"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    trust_remote_code=True,
+    quantization_config=bnb_config,
+    device_map="auto",
+)
+model.eval()
+generator = pipeline(
+    model=model,
+    tokenizer=tokenizer,
+    task="text-generation",
+    temperature=0.1,
+    max_new_tokens=500,
+    repetition_penalty=1.1,
+)
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 vectorizer_model = CountVectorizer(stop_words="english")
+representation_model = KeyBERTInspired()
 def calculate_embeddings(docs):
+    return embedding_model.encode(docs, show_progress_bar=True, batch_size=32)
 def calculate_n_neighbors_and_components(n_rows):
     new_model = BERTopic(
         language="english",
         # Sub-models
+        embedding_model=embedding_model,  # Step 1 - Extract embeddings
         umap_model=umap_model,  # Step 2 - UMAP model
         hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
         vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
     all_topics = base_model.topics_
     topic_info = base_model.get_topic_info()
+    new_topics_by_text_generation = {}
+    for _, row in topic_info.iterrows():
+        logging.info(
+            f"Processing topic: {row['Topic']} - Representation: {row['Representation']}"
+        )
+        prompt = f"{REPRESENTATION_PROMPT.replace('[KEYWORDS]', ','.join(row['Representation']))}"
+        logging.info(prompt)
+        topic_description = generator(prompt)
+        logging.info(topic_description)
+        new_topics_by_text_generation[row["Topic"]] = topic_description[0][
+            "generated_text"
+        ].replace(prompt, "")
+    base_model.set_topic_labels(new_topics_by_text_generation)
+    topics_info = base_model.get_topic_info()
+    topic_plot = (
+        base_model.visualize_document_datamap(
+            docs=all_docs,
+            topics=all_topics,
+            custom_labels=True,
+            reduced_embeddings=reduced_embeddings_array,
+            title="",
+            sub_title=sub_title,
+            width=800,
+            height=700,
+            arrowprops={
+                "arrowstyle": "wedge,tail_width=0.5",
+                "connectionstyle": "arc3,rad=0.05",
+                "linewidth": 0,
+                "fc": "#33333377",
+            },
+            dynamic_label_size=True,
+            # label_wrap_width=12,
+            label_over_points=True,
+            max_font_size=36,
+            min_font_size=4,
+        )
+        if plot_type == "DataMapPlot"
+        else base_model.visualize_documents(
+            docs=all_docs,
+            reduced_embeddings=reduced_embeddings_array,
+            custom_labels=True,
+            title="",
+        )
     )
+    custom_labels = base_model.custom_labels_
+    topic_names_array = [custom_labels[doc_topic + 1] for doc_topic in all_topics]
     interactive_plot = datamapplot.create_interactive_plot(
         reduced_embeddings_array,
         topic_names_array,
         base_model,
         all_topics,
         topic_info,
         topic_names_array,
         interactive_plot,
     )