Upload folder using huggingface_hub

Files changed:
- app.py (+38 / -56)
- entailment.py (+1 / -1)
- lcs.py (+2 / -2)
- paraphraser.py (+1 / -1)
- tree.py (+430 / -87)
app.py
CHANGED

@@ -3,31 +3,11 @@ nltk.download('stopwords')
 from transformers import AutoTokenizer
 from transformers import AutoModelForSeq2SeqLM
 import plotly.graph_objs as go
-import textwrap
 from transformers import pipeline
-import re
-import requests
-from PIL import Image
-import itertools
-import numpy as np
-import matplotlib.pyplot as plt
-import matplotlib
 from matplotlib.colors import ListedColormap, rgb2hex
-import ipywidgets as widgets
-from IPython.display import display, HTML
-import pandas as pd
-from pprint import pprint
-from tenacity import retry
-from tqdm import tqdm
-from transformers import GPT2LMHeadModel
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM
 import random
-from nltk.corpus import stopwords
-from termcolor import colored
-from nltk.translate.bleu_score import sentence_bleu
-from transformers import BertTokenizer, BertModel
 import gradio as gr
-from tree import ...
+from tree import generate_subplot1, generate_subplot2
 from paraphraser import generate_paraphrase
 from lcs import find_common_subsequences
 from highlighter import highlight_common_words, highlight_common_words_dict

@@ -47,22 +27,18 @@ def model(prompt):
     masked_sentences = []
     masked_words = []
     masked_logits = []
-    selected_sentences_list = list(selected_sentences.keys())

-    for sentence in ...
-        # Mask non-stopword
+    for sentence in paraphrased_sentences:
         masked_sent, logits, words = mask_non_stopword(sentence)
         masked_sentences.append(masked_sent)
         masked_words.append(words)
         masked_logits.append(logits)

-        # Mask non-stopword pseudorandom
         masked_sent, logits, words = mask_non_stopword_pseudorandom(sentence)
         masked_sentences.append(masked_sent)
         masked_words.append(words)
         masked_logits.append(logits)

-        # High entropy words
         masked_sent, logits, words = high_entropy_words(sentence, common_grams)
         masked_sentences.append(masked_sent)
         masked_words.append(words)

@@ -75,45 +51,39 @@ def model(prompt):
         sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='temperature', temperature=1.0))
         sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='greedy', temperature=1.0))

+    print(len(sampled_sentences))
+
     colors = ["red", "blue", "brown", "green"]

-    # Function to generate color from predefined set
     def select_color():
         return random.choice(colors)

-    # Create highlight_info with selected colors
     highlight_info = [(word, select_color()) for _, word in common_grams]

-    highlighted_user_prompt = highlight_common_words(common_grams, [user_prompt], "User Prompt (Highlighted and Numbered)")
+    highlighted_user_prompt = highlight_common_words(common_grams, [user_prompt], "Non-melting Points in the User Prompt")
     highlighted_accepted_sentences = highlight_common_words_dict(common_grams, selected_sentences, "Paraphrased Sentences")
     highlighted_discarded_sentences = highlight_common_words_dict(common_grams, discarded_sentences, "Discarded Sentences")

-    # Initialize empty list to hold the trees
-    trees = []
-
-    # Initialize the indices for masked and sampled sentences
+    trees1 = []
+    trees2 = []

     masked_index = 0
     sampled_index = 0

-    for i, sentence in enumerate(...
-        # Generate the sublists of masked and sampled sentences based on current indices
+    for i, sentence in enumerate(paraphrased_sentences):
         next_masked_sentences = masked_sentences[masked_index:masked_index + 3]
         next_sampled_sentences = sampled_sentences[sampled_index:sampled_index + 12]

-        # Create the tree for the current sentence
-        tree = generate_subplot(sentence, next_masked_sentences, next_sampled_sentences, highlight_info)
-        trees.append(tree)
-
-        # Update the indices for the next iteration
-        masked_index += 3
-        sampled_index += 12
+        tree1 = generate_subplot1(sentence, next_masked_sentences, highlight_info, common_grams)
+        trees1.append(tree1)
+
+        tree2 = generate_subplot2(next_masked_sentences, next_sampled_sentences, highlight_info, common_grams)
+        trees2.append(tree2)
+
+        masked_index += 3
+        sampled_index += 12

-    ...
+    return [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + trees1 + trees2

 with gr.Blocks(theme=gr.themes.Monochrome()) as demo:

@@ -136,17 +106,29 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
             with gr.TabItem("Discarded Sentences"):
                 highlighted_discarded_sentences = gr.HTML()

+    # Adding labels before the tree plots
+    with gr.Row():
+        gr.Markdown("### Where to Mask?")  # Label for masked sentences trees
+    with gr.Row():
+        with gr.Tabs():
+            tree1_tabs = []
+            for i in range(10):  # Adjust this range according to the number of trees
+                with gr.TabItem(f"Sentence {i+1}"):
+                    tree1 = gr.Plot()
+                    tree1_tabs.append(tree1)
+
+    with gr.Row():
+        gr.Markdown("### How to Mask?")  # Label for sampled sentences trees
     with gr.Row():
         with gr.Tabs():
-            for i in range(...
-                with gr.TabItem(f"...
+            tree2_tabs = []
+            for i in range(10):  # Adjust this range according to the number of trees
+                with gr.TabItem(f"Sentence {i+1}"):
+                    tree2 = gr.Plot()
+                    tree2_tabs.append(tree2)

-    submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + ...
+    submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs)
     clear_button.click(lambda: "", inputs=None, outputs=user_input)
-    clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + ...
+    clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree1_tabs + tree2_tabs)

-demo.launch(share=True)
+demo.launch(share=True)
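The masking and sampling helpers that model() calls — mask_non_stopword, mask_non_stopword_pseudorandom, high_entropy_words and sample_word — live outside this commit, so only their call sites are visible in the diff. Below is a minimal sketch of the interface those call sites imply: each masker returns (masked_sentence, logits, words) and sample_word fills the mask with one candidate. The bert-base-uncased fill-mask pipeline, the stopword list and the scoring details are assumptions for illustration, not the Space's actual implementation.

```python
# Hypothetical sketch of the masking/sampling interface used by model() in app.py.
# The call sites only require: masker(sentence) -> (masked_sentence, logits, words)
# and sample_word(masked_sentence, words, logits, sampling_technique, temperature) -> sentence.
# The masked-LM checkpoint and scoring details below are assumptions, not the Space's code.
import random
import numpy as np
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="bert-base-uncased")  # assumed masked-LM

def mask_non_stopword(sentence, stop_words=frozenset({"the", "a", "an", "is", "of", "to"})):
    """Mask one random non-stopword; return (masked sentence, candidate scores, candidate words)."""
    tokens = sentence.split()
    candidates = [i for i, t in enumerate(tokens) if t.lower() not in stop_words]
    if not candidates:
        return sentence, [], []
    idx = random.choice(candidates)
    tokens[idx] = fill_mask.tokenizer.mask_token
    masked = " ".join(tokens)
    preds = fill_mask(masked, top_k=5)
    words = [p["token_str"].strip() for p in preds]
    logits = [p["score"] for p in preds]  # pipeline scores stand in for logits here
    return masked, logits, words

def sample_word(masked_sentence, words, logits, sampling_technique="temperature", temperature=1.0):
    """Fill the mask with one candidate, chosen greedily or by temperature sampling."""
    if not words:
        return masked_sentence
    if sampling_technique == "greedy":
        choice = words[int(np.argmax(logits))]
    else:  # temperature sampling over the candidate scores
        probs = np.exp(np.array(logits) / temperature)
        probs /= probs.sum()
        choice = np.random.choice(words, p=probs)
    return masked_sentence.replace(fill_mask.tokenizer.mask_token, choice, 1)
```

mask_non_stopword_pseudorandom and high_entropy_words would follow the same return shape, differing only in how the position to mask is chosen (seeded choice vs. highest-entropy token under the masked LM).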
entailment.py
CHANGED

@@ -28,4 +28,4 @@ def analyze_entailment(original_sentence, paraphrased_sentences, threshold):

     return all_sentences, selected_sentences, discarded_sentences

-# print(analyze_entailment("I love you", ["...
+# print(analyze_entailment("I love you", [""], 0.7))
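Only the commented-out example changes here, but the three return values show how the filter is used upstream: paraphrases whose entailment score against the original clears the threshold go to selected_sentences, the rest to discarded_sentences. A rough sketch of that behaviour follows; the roberta-large-mnli checkpoint, the pipeline input format and the dict-shaped outputs are assumptions, not the repository's code.

```python
# Hypothetical sketch of an NLI-based analyze_entailment; model choice and return shapes are assumed.
from transformers import pipeline

nli = pipeline("text-classification", model="roberta-large-mnli")

def analyze_entailment(original_sentence, paraphrased_sentences, threshold):
    """Keep paraphrases whose entailment score against the original clears `threshold`."""
    all_sentences, selected_sentences, discarded_sentences = {}, {}, {}
    for para in paraphrased_sentences:
        scores = nli({"text": original_sentence, "text_pair": para}, top_k=None)
        entail = next(s["score"] for s in scores if s["label"] == "ENTAILMENT")
        all_sentences[para] = entail
        (selected_sentences if entail >= threshold else discarded_sentences)[para] = entail
    return all_sentences, selected_sentences, discarded_sentences

# print(analyze_entailment("I love you", ["I adore you"], 0.7))
```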
lcs.py
CHANGED

@@ -40,7 +40,7 @@ def find_common_subsequences(sentence, str_list):

     return indexed_common_grams

 # Example usage
-# sentence = "...
-# str_list = [...
+# sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time 5since the Holocaust” to be Jewish in the United States."
+# str_list = ['']

 # print(find_common_subsequences(sentence, str_list))
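Only the commented example changes, but for readers of the diff the important part is the output shape: an indexed list of (position, n-gram) pairs — the "non-melting points" shared by the prompt and every paraphrase — which app.py and tree.py consume as `for idx, lcs in common_grams`. A toy sketch of that behaviour is below; the real implementation's stopword handling and tie-breaking rules are not shown in this commit, so the details here are assumptions.

```python
# Hypothetical sketch of find_common_subsequences: longest word n-grams shared by the
# sentence and every string in str_list, returned as numbered (index, phrase) tuples.
def find_common_subsequences(sentence, str_list):
    def ngrams(words, n):
        return {" ".join(words[i:i + n]) for i in range(len(words) - n + 1)}

    sent_words = sentence.lower().split()
    common = set()
    for n in range(len(sent_words), 0, -1):  # prefer longer n-grams first
        for gram in ngrams(sent_words, n):
            if all(gram in other.lower() for other in str_list):
                # skip grams already covered by a longer match
                if not any(gram in chosen for chosen in common):
                    common.add(gram)
    # Number the surviving grams in the order they appear in the sentence
    ordered = sorted(common, key=lambda g: sentence.lower().find(g))
    indexed_common_grams = [(i + 1, gram) for i, gram in enumerate(ordered)]
    return indexed_common_grams

# print(find_common_subsequences("the quick brown fox", ["a quick brown fox ran", "that quick brown fox"]))
# -> [(1, 'quick brown fox')]
```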
paraphraser.py
CHANGED

@@ -28,4 +28,4 @@ def generate_paraphrase(question):
     res = paraphrase(question, para_tokenizer, para_model)
     return res

-# print(generate_paraphrase("...
+# print(generate_paraphrase("Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time 5since the Holocaust” to be Jewish in the United States."))
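generate_paraphrase wraps a paraphrase() call whose model setup sits outside this diff. A plausible shape for that helper is sketched below: a seq2seq paraphrase model generating several candidates with beam search, since the rest of the pipeline expects a list of paraphrases. The checkpoint name, the "paraphrase:" prefix and the generation settings are assumptions, not the file's actual contents.

```python
# Hypothetical sketch of the paraphrase() helper that generate_paraphrase() wraps.
# The checkpoint and generation settings are assumed for illustration only.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

def paraphrase(question, para_tokenizer, para_model, num_return_sequences=10, max_length=128):
    """Return a list of paraphrase candidates for `question`."""
    inputs = para_tokenizer(f"paraphrase: {question}", return_tensors="pt", truncation=True)
    outputs = para_model.generate(
        **inputs,
        num_beams=num_return_sequences,
        num_return_sequences=num_return_sequences,
        max_length=max_length,
    )
    return para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
```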
tree.py
CHANGED

The previous builder generate_subplot(paraphrased_sentence, scheme_sentences, sampled_sentence, highlight_info), which laid out the paraphrased sentence (level L0), the masking-scheme sentences (L1) and the sampled sentences (L2, spaced with their own l2_y_gap = 6) in a single Plotly figure, is removed. Two builders replace it, both taking an extra common_grams argument so that every non-melting point is prefixed with its LCS index: generate_subplot1 draws the paraphrased sentence and its three masked variants ("Where to Mask?"), and generate_subplot2 draws the masked variants and their twelve sampled variants ("How to Mask?"). The commented-out draft kept at the top of the file is rewritten the same way: new lines 6-337 hold an earlier version of both functions without the common_grams handling, ahead of the active definitions. The active code in the new file:

import plotly.graph_objects as go
import textwrap
import re
from collections import defaultdict

def generate_subplot1(paraphrased_sentence, scheme_sentences, highlight_info, common_grams):
    # Combine nodes into one list with appropriate labels
    nodes = [paraphrased_sentence] + scheme_sentences
    nodes[0] += ' L0'  # Paraphrased sentence is level 0
    for i in range(1, len(nodes)):
        nodes[i] += ' L1'  # Scheme sentences are level 1

    # Function to apply LCS numbering based on common_grams
    def apply_lcs_numbering(sentence, common_grams):
        for idx, lcs in common_grams:
            # Only replace if the LCS is a whole word (not part of another word)
            sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence)
        return sentence

    # Apply LCS numbering
    nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]

    # Define the highlight_words function
    def highlight_words(sentence, color_map):
        for word, color in color_map.items():
            sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
        return sentence

    # Clean and wrap nodes, and highlight specified words globally
    cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
    global_color_map = dict(highlight_info)
    highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
    wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=55)) for node in highlighted_nodes]

    # Function to determine tree levels and create edges dynamically
    def get_levels_and_edges(nodes):
        levels = {}
        edges = []
        for i, node in enumerate(nodes):
            level = int(node.split()[-1][1])
            levels[i] = level

        # Connect the root (L0) paraphrased sentence to every L1 scheme sentence
        root_node = 0
        for i, level in levels.items():
            if level == 1:
                edges.append((root_node, i))

        return levels, edges

    # Get levels and dynamic edges
    levels, edges = get_levels_and_edges(nodes)
    max_level = max(levels.values(), default=0)

    # Calculate positions
    positions = {}
    level_heights = defaultdict(int)
    for node, level in levels.items():
        level_heights[level] += 1

    y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
    x_gap = 2
    l1_y_gap = 10

    for node, level in levels.items():
        if level == 1:
            positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
        else:
            positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
        y_offsets[level] += 1

    # Function to highlight words in a wrapped node string
    def color_highlighted_words(node, color_map):
        parts = re.split(r'(\{\{.*?\}\})', node)
        colored_parts = []
        for part in parts:
            match = re.match(r'\{\{(.*?)\}\}', part)
            if match:
                word = match.group(1)
                color = color_map.get(word, 'black')
                colored_parts.append(f"<span style='color: {color};'>{word}</span>")
            else:
                colored_parts.append(part)
        return ''.join(colored_parts)

    # Define the text for each edge
    edge_texts = [
        "Highest Entropy Masking",
        "Pseudo-random Masking",
        "Random Masking",
        "Greedy Sampling",
        "Temperature Sampling",
        "Exponential Minimum Sampling",
        "Inverse Transform Sampling",
        "Greedy Sampling",
        "Temperature Sampling",
        "Exponential Minimum Sampling",
        "Inverse Transform Sampling",
        "Greedy Sampling",
        "Temperature Sampling",
        "Exponential Minimum Sampling",
        "Inverse Transform Sampling"
    ]

    # Create figure
    fig1 = go.Figure()

    # Add nodes to the figure
    for i, node in enumerate(wrapped_nodes):
        colored_node = color_highlighted_words(node, global_color_map)
        x, y = positions[i]
        fig1.add_trace(go.Scatter(
            x=[-x],  # Reflect the x coordinate
            y=[y],
            mode='markers',
            marker=dict(size=10, color='blue'),
            hoverinfo='none'
        ))
        fig1.add_annotation(
            x=-x,  # Reflect the x coordinate
            y=y,
            text=colored_node,
            showarrow=False,
            xshift=15,
            align="center",
            font=dict(size=12),
            bordercolor='black',
            borderwidth=1,
            borderpad=2,
            bgcolor='white',
            width=300,
            height=120
        )

    # Add edges and text above each edge
    for i, edge in enumerate(edges):
        x0, y0 = positions[edge[0]]
        x1, y1 = positions[edge[1]]
        fig1.add_trace(go.Scatter(
            x=[-x0, -x1],  # Reflect the x coordinates
            y=[y0, y1],
            mode='lines',
            line=dict(color='black', width=1)
        ))

        # Calculate the midpoint of the edge
        mid_x = (-x0 + -x1) / 2
        mid_y = (y0 + y1) / 2

        # Adjust y position to shift text upwards
        text_y_position = mid_y + 0.8  # Increase this value to shift the text further upwards

        # Add text annotation above the edge
        fig1.add_annotation(
            x=mid_x,
            y=text_y_position,
            text=edge_texts[i],  # Use the text specific to this edge
            showarrow=False,
            font=dict(size=12),
            align="center"
        )

    fig1.update_layout(
        showlegend=False,
        margin=dict(t=20, b=20, l=20, r=20),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        width=1435,  # Adjusted width to accommodate more levels
        height=1000  # Adjusted height to accommodate more levels
    )

    return fig1

def generate_subplot2(scheme_sentences, sampled_sentence, highlight_info, common_grams):
    # Combine nodes into one list with appropriate labels
    nodes = scheme_sentences + sampled_sentence
    para_len = len(scheme_sentences)

    # Reassign levels: L1 -> L0, L2 -> L1
    for i in range(para_len):
        nodes[i] += ' L0'  # Scheme sentences are now level 0
    for i in range(para_len, len(nodes)):
        nodes[i] += ' L1'  # Sampled sentences are now level 1

    # Function to apply LCS numbering based on common_grams
    def apply_lcs_numbering(sentence, common_grams):
        for idx, lcs in common_grams:
            # Only replace if the LCS is a whole word (not part of another word)
            sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence)
        return sentence

    # Apply LCS numbering
    nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]

    # Define the highlight_words function
    def highlight_words(sentence, color_map):
        for word, color in color_map.items():
            sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
        return sentence

    # Clean and wrap nodes, and highlight specified words globally
    cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
    global_color_map = dict(highlight_info)
    highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
    wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=80)) for node in highlighted_nodes]

    # Function to determine tree levels and create edges dynamically
    def get_levels_and_edges(nodes):
        levels = {}
        edges = []
        for i, node in enumerate(nodes):
            level = int(node.split()[-1][1])
            levels[i] = level

        # Add edges from L0 to all L1 nodes
        l0_indices = [i for i, level in levels.items() if level == 0]
        l1_indices = [i for i, level in levels.items() if level == 1]

        # Ensure there are exactly 3 L0 nodes
        if len(l0_indices) < 3:
            raise ValueError("There should be exactly 3 L0 nodes to attach edges correctly.")

        # Split L1 nodes into 3 groups of 4 for attaching to L0 nodes
        for i, l1_node in enumerate(l1_indices):
            if i < 4:
                edges.append((l0_indices[0], l1_node))  # Connect to the first L0 node
            elif i < 8:
                edges.append((l0_indices[1], l1_node))  # Connect to the second L0 node
            else:
                edges.append((l0_indices[2], l1_node))  # Connect to the third L0 node

        return levels, edges

    # Get levels and dynamic edges
    levels, edges = get_levels_and_edges(nodes)

    # Calculate positions
    positions = {}
    level_heights = defaultdict(int)
    for node, level in levels.items():
        level_heights[level] += 1

    y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
    x_gap = 2
    l1_y_gap = 10

    for node, level in levels.items():
        if level == 1:
            positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
        else:
            positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
        y_offsets[level] += 1

    # The color_highlighted_words helper and the edge_texts list are unchanged here
    # and identical to those defined in generate_subplot1 above.

    # Create figure
    fig2 = go.Figure()

    # Add nodes to the figure
    for i, node in enumerate(wrapped_nodes):
        colored_node = color_highlighted_words(node, global_color_map)
        x, y = positions[i]
        fig2.add_trace(go.Scatter(
            x=[-x],  # Reflect the x coordinate
            y=[y],
            mode='markers',
            marker=dict(size=10, color='blue'),
            hoverinfo='none'
        ))
        fig2.add_annotation(
            x=-x,  # Reflect the x coordinate
            y=y,
            text=colored_node,
            showarrow=False,
            xshift=15,
            align="center",
            font=dict(size=12),
            bordercolor='black',
            borderwidth=1,
            borderpad=2,
            bgcolor='white',
            width=450,
            height=65
        )

    # Add edges and text above each edge
    for i, edge in enumerate(edges):
        x0, y0 = positions[edge[0]]
        x1, y1 = positions[edge[1]]
        fig2.add_trace(go.Scatter(
            x=[-x0, -x1],  # Reflect the x coordinates
            y=[y0, y1],
            mode='lines',
            line=dict(color='black', width=1)
        ))

        # Calculate the midpoint of the edge
        mid_x = (-x0 + -x1) / 2
        mid_y = (y0 + y1) / 2

        # Adjust y position to shift text upwards
        text_y_position = mid_y + 0.8  # Increase this value to shift the text further upwards

        # Add text annotation above the edge
        # Use a fallback text if we exceed the length of edge_texts
        text = edge_texts[i] if i < len(edge_texts) else f"Edge {i+1}"
        fig2.add_annotation(
            x=mid_x,
            y=text_y_position,
            text=text,  # Use the text specific to this edge
            showarrow=False,
            font=dict(size=12),
            align="center"
        )

    fig2.update_layout(
        showlegend=False,
        margin=dict(t=20, b=20, l=20, r=20),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        width=1435,  # Adjusted width to accommodate more levels
        height=1000  # Adjusted height to accommodate more levels
    )

    return fig2
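For reference, generate_subplot1 and generate_subplot2 are driven once per paraphrased sentence with three masked variants and twelve sampled variants, mirroring the loop in app.py's model(). The snippet below shows that call pattern; the input strings are toy values for illustration only.

```python
# Illustrative driver mirroring the per-sentence loop in app.py's model(); inputs are toy values.
from tree import generate_subplot1, generate_subplot2

common_grams = [(1, "quick brown fox")]                        # (index, non-melting point)
highlight_info = [("quick brown fox", "red")]                  # (phrase, colour)
sentence = "A quick brown fox jumps over the lazy dog"
masked = ["A quick brown fox [MASK] over the lazy dog"] * 3    # three masking schemes
sampled = ["A quick brown fox leaps over the lazy dog"] * 12   # four samplers x three schemes

fig1 = generate_subplot1(sentence, masked, highlight_info, common_grams)   # "Where to Mask?"
fig2 = generate_subplot2(masked, sampled, highlight_info, common_grams)    # "How to Mask?"
fig1.show()
fig2.show()
```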