jgyasu committed
Commit ee305a4
Parent: 8b20c56

Upload folder using huggingface_hub

Files changed (8)
  1. app.py +93 -19
  2. entailment.py +1 -1
  3. highlighter.py +33 -42
  4. lcs.py +3 -3
  5. masking_methods.py +84 -12
  6. paraphraser.py +1 -1
  7. sampling_methods.py +31 -139
  8. tree.py +90 -47
app.py CHANGED
@@ -6,7 +6,6 @@ import plotly.graph_objs as go
 import textwrap
 from transformers import pipeline
 import re
-import time
 import requests
 from PIL import Image
 import itertools
@@ -20,10 +19,7 @@ import pandas as pd
 from pprint import pprint
 from tenacity import retry
 from tqdm import tqdm
-import scipy.stats
-import torch
 from transformers import GPT2LMHeadModel
-import seaborn as sns
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM
 import random
 from nltk.corpus import stopwords
@@ -31,22 +27,92 @@ from termcolor import colored
 from nltk.translate.bleu_score import sentence_bleu
 from transformers import BertTokenizer, BertModel
 import gradio as gr
-from tree import generate_plot
+from tree import generate_subplot
 from paraphraser import generate_paraphrase
 from lcs import find_common_subsequences
 from highlighter import highlight_common_words, highlight_common_words_dict
 from entailment import analyze_entailment
+from masking_methods import mask_non_stopword, mask_non_stopword_pseudorandom, high_entropy_words
+from sampling_methods import sample_word
+
 
 # Function for the Gradio interface
 def model(prompt):
-    sentence = prompt
-    paraphrased_sentences = generate_paraphrase(sentence)
-    analyzed_paraphrased_sentences, selected_sentences, discarded_sentences = analyze_entailment(sentence, paraphrased_sentences, 0.7)
-    common_grams = find_common_subsequences(sentence, selected_sentences)
-    highlighted_user_prompt = highlight_common_words(common_grams, [sentence], "User Prompt (Highlighted and Numbered)") # Pass the sentence as a list
-    highlighted_paraphrased_sentences = highlight_common_words_dict(common_grams, selected_sentences, discarded_sentences, "Sentences Generated by the Paraphraser")
-    tree = generate_plot(sentence, list(selected_sentences.keys()))
-    return highlighted_user_prompt, highlighted_paraphrased_sentences, tree
+    user_prompt = prompt
+    paraphrased_sentences = generate_paraphrase(user_prompt)
+    analyzed_paraphrased_sentences, selected_sentences, discarded_sentences = analyze_entailment(user_prompt, paraphrased_sentences, 0.7)
+    length_accepted_sentences = len(selected_sentences)
+    common_grams = find_common_subsequences(user_prompt, selected_sentences)
+
+    masked_sentences = []
+    masked_words = []
+    masked_logits = []
+    selected_sentences_list = list(selected_sentences.keys())
+
+    for sentence in selected_sentences_list:
+        # Mask non-stopword
+        masked_sent, logits, words = mask_non_stopword(sentence)
+        masked_sentences.append(masked_sent)
+        masked_words.append(words)
+        masked_logits.append(logits)
+
+        # Mask non-stopword pseudorandom
+        masked_sent, logits, words = mask_non_stopword_pseudorandom(sentence)
+        masked_sentences.append(masked_sent)
+        masked_words.append(words)
+        masked_logits.append(logits)
+
+        # High entropy words
+        masked_sent, logits, words = high_entropy_words(sentence, common_grams)
+        masked_sentences.append(masked_sent)
+        masked_words.append(words)
+        masked_logits.append(logits)
+
+    sampled_sentences = []
+    for masked_sent, words, logits in zip(masked_sentences, masked_words, masked_logits):
+        sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='inverse_transform', temperature=1.0))
+        sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='exponential_minimum', temperature=1.0))
+        sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='temperature', temperature=1.0))
+        sampled_sentences.append(sample_word(masked_sent, words, logits, sampling_technique='greedy', temperature=1.0))
+
+    # Predefined set of colors that are visible on a white background, excluding black
+    colors = ["red", "blue", "brown", "green"]
+
+    # Function to generate color from predefined set
+    def select_color():
+        return random.choice(colors)
+
+    # Create highlight_info with selected colors
+    highlight_info = [(word, select_color()) for _, word in common_grams]
+
+
+    highlighted_user_prompt = highlight_common_words(common_grams, [user_prompt], "User Prompt (Highlighted and Numbered)")
+    highlighted_accepted_sentences = highlight_common_words_dict(common_grams, selected_sentences, "Paraphrased Sentences")
+    highlighted_discarded_sentences = highlight_common_words_dict(common_grams, discarded_sentences, "Discarded Sentences")
+
+    # Initialize empty list to hold the trees
+    trees = []
+
+    # Initialize the indices for masked and sampled sentences
+    masked_index = 0
+    sampled_index = 0
+
+    for i, sentence in enumerate(selected_sentences):
+        # Generate the sublists of masked and sampled sentences based on current indices
+        next_masked_sentences = masked_sentences[masked_index:masked_index + 3]
+        next_sampled_sentences = sampled_sentences[sampled_index:sampled_index + 12]
+
+        # Create the tree for the current sentence
+        tree = generate_subplot(sentence, next_masked_sentences, next_sampled_sentences, highlight_info)
+        trees.append(tree)
+
+        # Update the indices for the next iteration
+        masked_index += 3
+        sampled_index += 12
+
+
+    # Return all the outputs together
+    return [highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + trees
 
 
 with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
@@ -63,15 +129,23 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
         highlighted_user_prompt = gr.HTML()
 
     with gr.Row():
-        highlighted_paraphrased_sentences = gr.HTML()
+        with gr.Tabs():
+            with gr.TabItem("Paraphrased Sentences"):
+                highlighted_accepted_sentences = gr.HTML()
+            with gr.TabItem("Discarded Sentences"):
+                highlighted_discarded_sentences = gr.HTML()
 
     with gr.Row():
-        tree = gr.Plot()
+        with gr.Tabs():
+            tree_tabs = []
+            for i in range(3):  # Adjust this range according to the number of trees
+                with gr.TabItem(f"Tree {i+1}"):
+                    tree = gr.Plot()
+                    tree_tabs.append(tree)
 
-    submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_paraphrased_sentences, tree])
+    submit_button.click(model, inputs=user_input, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree_tabs)
     clear_button.click(lambda: "", inputs=None, outputs=user_input)
-    clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_paraphrased_sentences, tree])
+    clear_button.click(lambda: "", inputs=None, outputs=[highlighted_user_prompt, highlighted_accepted_sentences, highlighted_discarded_sentences] + tree_tabs)
 
 # Launch the demo
-demo.launch(share=True)
-
+demo.launch(share=True)
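A minimal, self-contained sketch (not part of the commit) of the bookkeeping the reworked model() relies on: each accepted sentence contributes 3 masked variants (one per masking scheme) and 3 × 4 = 12 sampled variants (one per sampling technique per masked variant), which is why the loop slices masked_sentences in steps of 3 and sampled_sentences in steps of 12, and why the UI pre-allocates three tree tabs.

# Toy stand-ins for the real pipeline: 2 accepted sentences, 3 masking schemes, 4 sampling techniques.
accepted = ["sentence A", "sentence B"]
masked = [f"{s} | mask {m}" for s in accepted for m in range(3)]                                   # 3 per sentence
sampled = [f"{s} | mask {m} | sample {t}" for s in accepted for m in range(3) for t in range(4)]   # 12 per sentence

for i, s in enumerate(accepted):
    next_masked = masked[i * 3:(i + 1) * 3]        # mirrors masked_index += 3
    next_sampled = sampled[i * 12:(i + 1) * 12]    # mirrors sampled_index += 12
    print(s, len(next_masked), len(next_sampled))  # -> 3 masked and 12 sampled variants each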
 
entailment.py CHANGED
@@ -28,4 +28,4 @@ def analyze_entailment(original_sentence, paraphrased_sentences, threshold):
 
     return all_sentences, selected_sentences, discarded_sentences
 
-print(analyze_entailment("I love you", ["You're being loved by me"], 0.7))
+# print(analyze_entailment("I love you", ["You're being loved by me"], 0.7))
highlighter.py CHANGED
@@ -39,57 +39,48 @@ def highlight_common_words(common_words, sentences, title):
     '''
 
 
+
 import re
 
-def highlight_common_words_dict(common_words, selected_sentences, discarded_sentences, title):
+def highlight_common_words_dict(common_words, sentences, title):
     color_map = {}
     color_index = 0
     highlighted_html = []
 
-    def highlight_sentences(sentences, start_idx, section_title):
-        nonlocal color_index
-        nonlocal color_map
-        highlighted_sentences = [f'<h4 style="color: #374151; margin-bottom: 5px;">{section_title}</h4>']
+    for idx, (sentence, score) in enumerate(sentences.items(), start=1):
+        sentence_with_idx = f"{idx}. {sentence}"
+        highlighted_sentence = sentence_with_idx
 
-        for idx, (sentence, score) in enumerate(sentences.items(), start=start_idx):
-            sentence_with_idx = f"{idx}. {sentence}"
-            highlighted_sentence = sentence_with_idx
-
-            for index, word in common_words:
-                if word not in color_map:
-                    color_map[word] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
-                    color_index += 1
-                escaped_word = re.escape(word)
-                pattern = rf'\b{escaped_word}\b'
-                highlighted_sentence = re.sub(
-                    pattern,
-                    lambda m, idx=index, color=color_map[word]: (
-                        f'<span style="background-color: {color}; font-weight: bold;'
-                        f' padding: 1px 2px; border-radius: 2px; position: relative;">'
-                        f'<span style="background-color: black; color: white; border-radius: 50%;'
-                        f' padding: 1px 3px; margin-right: 3px; font-size: 0.8em;">{idx}</span>'
-                        f'{m.group(0)}'
-                        f'</span>'
-                    ),
-                    highlighted_sentence,
-                    flags=re.IGNORECASE
-                )
-            highlighted_sentences.append(
-                f'<div style="margin-bottom: 5px;">'
-                f'{highlighted_sentence}'
-                f'<div style="display: inline-block; margin-left: 5px; border: 1px solid #ddd; padding: 3px 5px; border-radius: 3px; background-color: white; font-size: 0.9em;">'
-                f'Entailment Score: {score}</div></div>'
+        for index, word in common_words:
+            if word not in color_map:
+                color_map[word] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
+                color_index += 1
+            escaped_word = re.escape(word)
+            pattern = rf'\b{escaped_word}\b'
+            highlighted_sentence = re.sub(
+                pattern,
+                lambda m, idx=index, color=color_map[word]: (
+                    f'<span style="background-color: {color}; font-weight: bold;'
+                    f' padding: 1px 2px; border-radius: 2px; position: relative;">'
+                    f'<span style="background-color: black; color: white; border-radius: 50%;'
+                    f' padding: 1px 3px; margin-right: 3px; font-size: 0.8em;">{idx}</span>'
+                    f'{m.group(0)}'
+                    f'</span>'
+                ),
+                highlighted_sentence,
+                flags=re.IGNORECASE
             )
-
-        return highlighted_sentences
-
-    selected_html = highlight_sentences(selected_sentences, 1, "Selected Sentences")
-    discarded_html = highlight_sentences(discarded_sentences, 1, "Discarded Sentences")
+        highlighted_html.append(
+            f'<div style="margin-bottom: 5px;">'
+            f'{highlighted_sentence}'
+            f'<div style="display: inline-block; margin-left: 5px; padding: 3px 5px; border-radius: 3px; background-color: white; font-size: 0.9em;">'
+            f'Entailment Score: {score}</div></div>'
+        )
 
-    final_html = "<br>".join(selected_html + discarded_html)
+    final_html = "<br>".join(highlighted_html)
     return f'''
-    <div style="border: solid 1px #; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
-        <h3 style="margin-top: 0; font-size: 1em; color: #111827; margin-bottom: 10px;">{title}</h3>
+    <div style="background-color: #ffffff; color: #374151;">
+        <h3 style="margin-top: 0; font-size: 1em; color: #111827;">{title}</h3>
         <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
     </div>
-    '''
+    '''
lcs.py CHANGED
@@ -40,7 +40,7 @@ def find_common_subsequences(sentence, str_list):
     return indexed_common_grams
 
 # Example usage
-sentence = "Kim Beom-su, the billionaire behind the South Korean technology giant Kakao, was taken into custody on allegations of stock manipulation during a bidding war over one of the country’s largest K-pop agencies."
-str_list = ["The founder of South Korean technology company Kakao, billionaire Kim Beom-su, was arrested on charges of stock fraud during a bidding war for one of North Korea's biggest K-pop companies.", "In a bidding war for one of South Korea's largest K-pop agencies, Kim Beom-su, the billionaire who owns Kakao, was arrested on charges of manipulating stocks.", "During a bidding war for one of South Korea's biggest K-pop agencies, Kim Beom-su, the billionaire who owns Kakao, was arrested on charges of manipulating stocks.", "Kim Beom-su, the founder of South Korean technology giant Kakao's billionaire investor status, was arrested on charges of stock fraud during a bidding war for one of North Korea'S top K-pop agencies.", "A bidding war over one of South Korea's biggest K-pop agencies led to the arrest and apprehension charges of Kim Beom-Su, the billionaire who owns the technology giant Kakao.", "The billionaire who owns South Korean technology giant Kakao, Kim Beom-Su, was taken into custody for allegedly engaging in stock trading during a bidding war for one of North Korea's biggest K-pop media groups.", "Accused of stockpiling during a bidding war for one of South Korea's biggest K-pop agencies, Kim Beom-Su, the founder and owner of technology firm known as Kakao, was arrested on charges of manipulating stocks.", 'Kakao, the South Korean technology giant, was involved in a bidding war with Kim Beon-su, its founder, who was arrested on charges of manipulating stocks.', "South Korea's Kakao corporation'entrepreneur husband, Kim Beom-su (pictured), was arrested on suspicion of stock fraud during a bidding war for one of the country'S top K-pop companies.", 'Kim Beom-su, the billionaire who own a South Korean technology company called Kakaof, was arrested on charges of manipulating stocks in an ongoing bidding war over one million shares.']
+# sentence = "Kim Beom-su, the billionaire behind the South Korean technology giant Kakao, was taken into custody on allegations of stock manipulation during a bidding war over one of the country’s largest K-pop agencies."
+# str_list = ["The founder of South Korean technology company Kakao, billionaire Kim Beom-su, was arrested on charges of stock fraud during a bidding war for one of North Korea's biggest K-pop companies.", "In a bidding war for one of South Korea's largest K-pop agencies, Kim Beom-su, the billionaire who owns Kakao, was arrested on charges of manipulating stocks.", "During a bidding war for one of South Korea's biggest K-pop agencies, Kim Beom-su, the billionaire who owns Kakao, was arrested on charges of manipulating stocks.", "Kim Beom-su, the founder of South Korean technology giant Kakao's billionaire investor status, was arrested on charges of stock fraud during a bidding war for one of North Korea'S top K-pop agencies.", "A bidding war over one of South Korea's biggest K-pop agencies led to the arrest and apprehension charges of Kim Beom-Su, the billionaire who owns the technology giant Kakao.", "The billionaire who owns South Korean technology giant Kakao, Kim Beom-Su, was taken into custody for allegedly engaging in stock trading during a bidding war for one of North Korea's biggest K-pop media groups.", "Accused of stockpiling during a bidding war for one of South Korea's biggest K-pop agencies, Kim Beom-Su, the founder and owner of technology firm known as Kakao, was arrested on charges of manipulating stocks.", 'Kakao, the South Korean technology giant, was involved in a bidding war with Kim Beon-su, its founder, who was arrested on charges of manipulating stocks.', "South Korea's Kakao corporation'entrepreneur husband, Kim Beom-su (pictured), was arrested on suspicion of stock fraud during a bidding war for one of the country'S top K-pop companies.", 'Kim Beom-su, the billionaire who own a South Korean technology company called Kakaof, was arrested on charges of manipulating stocks in an ongoing bidding war over one million shares.']
 
-print(find_common_subsequences(sentence, str_list))
+# print(find_common_subsequences(sentence, str_list))
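A small sketch (values made up) of the shape find_common_subsequences appears to return, since the rest of the commit depends on it: a list of (index, phrase) pairs, iterated as `for _, word in common_grams` in app.py and passed as `non_melting_points` to high_entropy_words in masking_methods.py.

# Hypothetical output shape only; the actual phrases depend on the input sentences.
common_grams = [(1, "kim beom-su"), (2, "kakao"), (3, "k-pop")]
for _, phrase in common_grams:
    print(phrase)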
masking_methods.py CHANGED
@@ -1,3 +1,66 @@
+# from transformers import AutoTokenizer, AutoModelForMaskedLM
+# from transformers import pipeline
+# import random
+# from nltk.corpus import stopwords
+# import math
+
+# # Masking Model
+# def mask_non_stopword(sentence):
+#     stop_words = set(stopwords.words('english'))
+#     words = sentence.split()
+#     non_stop_words = [word for word in words if word.lower() not in stop_words]
+#     if not non_stop_words:
+#         return sentence
+#     word_to_mask = random.choice(non_stop_words)
+#     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+#     return masked_sentence
+
+# def mask_non_stopword_pseudorandom(sentence):
+#     stop_words = set(stopwords.words('english'))
+#     words = sentence.split()
+#     non_stop_words = [word for word in words if word.lower() not in stop_words]
+#     if not non_stop_words:
+#         return sentence
+#     random.seed(10)
+#     word_to_mask = random.choice(non_stop_words)
+#     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
+#     return masked_sentence
+
+# def high_entropy_words(sentence, non_melting_points):
+#     stop_words = set(stopwords.words('english'))
+#     words = sentence.split()
+
+#     non_melting_words = set()
+#     for _, point in non_melting_points:
+#         non_melting_words.update(point.lower().split())
+
+#     candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
+
+#     if not candidate_words:
+#         return sentence
+
+#     max_entropy = -float('inf')
+#     max_entropy_word = None
+
+#     for word in candidate_words:
+#         masked_sentence = sentence.replace(word, '[MASK]', 1)
+#         predictions = fill_mask(masked_sentence)
+
+#         # Calculate entropy based on top 5 predictions
+#         entropy = -sum(pred['score'] * math.log(pred['score']) for pred in predictions[:5])
+
+#         if entropy > max_entropy:
+#             max_entropy = entropy
+#             max_entropy_word = word
+
+#     return sentence.replace(max_entropy_word, '[MASK]', 1)
+
+
+# # Load tokenizer and model for masked language model
+# tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+# model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+# fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 from transformers import pipeline
 import random
@@ -10,21 +73,27 @@ def mask_non_stopword(sentence):
     words = sentence.split()
     non_stop_words = [word for word in words if word.lower() not in stop_words]
     if not non_stop_words:
-        return sentence
+        return sentence, None, None
     word_to_mask = random.choice(non_stop_words)
     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-    return masked_sentence
+    predictions = fill_mask(masked_sentence)
+    words = [pred['score'] for pred in predictions]
+    logits = [pred['token_str'] for pred in predictions]
+    return masked_sentence, words, logits
 
 def mask_non_stopword_pseudorandom(sentence):
     stop_words = set(stopwords.words('english'))
     words = sentence.split()
    non_stop_words = [word for word in words if word.lower() not in stop_words]
     if not non_stop_words:
-        return sentence
+        return sentence, None, None
     random.seed(10)
     word_to_mask = random.choice(non_stop_words)
     masked_sentence = sentence.replace(word_to_mask, '[MASK]', 1)
-    return masked_sentence
+    predictions = fill_mask(masked_sentence)
+    words = [pred['score'] for pred in predictions]
+    logits = [pred['token_str'] for pred in predictions]
+    return masked_sentence, words, logits
 
 def high_entropy_words(sentence, non_melting_points):
     stop_words = set(stopwords.words('english'))
@@ -37,10 +106,11 @@ def high_entropy_words(sentence, non_melting_points):
     candidate_words = [word for word in words if word.lower() not in stop_words and word.lower() not in non_melting_words]
 
     if not candidate_words:
-        return sentence
+        return sentence, None, None
 
     max_entropy = -float('inf')
     max_entropy_word = None
+    max_logits = None
 
     for word in candidate_words:
         masked_sentence = sentence.replace(word, '[MASK]', 1)
@@ -52,17 +122,19 @@ def high_entropy_words(sentence, non_melting_points):
         if entropy > max_entropy:
             max_entropy = entropy
             max_entropy_word = word
+            max_logits = [pred['score'] for pred in predictions]
 
-    return sentence.replace(max_entropy_word, '[MASK]', 1)
-
+    masked_sentence = sentence.replace(max_entropy_word, '[MASK]', 1)
+    words = [pred['score'] for pred in predictions]
+    logits = [pred['token_str'] for pred in predictions]
+    return masked_sentence, words, logits
 
 # Load tokenizer and model for masked language model
 tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
 model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
 fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
 
-def mask(sentence):
-    predictions = fill_mask(sentence)
-    masked_sentences = [predictions[i]['sequence'] for i in range(len(predictions))]
-    return masked_sentences
-
+non_melting_points = [(1, 'Jewish'), (2, 'messages'), (3, 'stab')]
+a, b, c = high_entropy_words("A former Cornell University student was sentenced to 21 months in prison on Monday after admitting that he had posted a series of online messages last fall in which he threatened to stab, rape and behead Jewish people", non_melting_points)
+print(f"logits type: {type(b)}")
+print(f"logits content: {b}")
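A short, self-contained sketch (not from the commit) of the new return contract: each masking function now returns a 3-tuple instead of a bare string. As committed, the second element holds the fill-mask scores and the third the predicted token strings, and app.py unpacks the tuple as (masked_sent, logits, words).

# dummy_predictions stands in for the fill-mask pipeline output, a list of dicts
# with 'score' and 'token_str' keys (only these two keys are used by this commit).
dummy_predictions = [
    {"score": 0.45, "token_str": "messages"},
    {"score": 0.30, "token_str": "threats"},
    {"score": 0.25, "token_str": "posts"},
]

masked_sentence = "he had posted a series of online [MASK] last fall"
scores = [p["score"] for p in dummy_predictions]      # second element of the returned tuple
tokens = [p["token_str"] for p in dummy_predictions]  # third element of the returned tuple

# Callers in app.py unpack the tuple as (masked_sent, logits, words), so the
# scores land in `logits` and the token strings in `words`.
masked_sent, logits, words = masked_sentence, scores, tokens
print(masked_sent, logits, words)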
 
paraphraser.py CHANGED
@@ -28,4 +28,4 @@ def generate_paraphrase(question):
     res = paraphrase(question, para_tokenizer, para_model)
     return res
 
-print(generate_paraphrase("Kim Beom-su, the billionaire behind the South Korean technology giant Kakao, was taken into custody on allegations of stock manipulation during a bidding war over one of the country’s largest K-pop agencies."))
+# print(generate_paraphrase("Kim Beom-su, the billionaire behind the South Korean technology giant Kakao, was taken into custody on allegations of stock manipulation during a bidding war over one of the country’s largest K-pop agencies."))
sampling_methods.py CHANGED
@@ -1,145 +1,33 @@
-import re
-from nltk.corpus import stopwords
-import random
-from termcolor import colored
-
-# Function to Watermark a Word Take Randomly Between Each lcs Point (Random Sampling)
-def random_sampling(original_sentence, paraphrased_sentences):
-    stop_words = set(stopwords.words('english'))
-    original_sentence_lower = original_sentence.lower()
-    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
-    paraphrased_sentences_no_stopwords = []
-
-    for sentence in paraphrased_sentences_lower:
-        words = re.findall(r'\b\w+\b', sentence)
-        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
-        paraphrased_sentences_no_stopwords.append(filtered_sentence)
-
-    results = []
-    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
-        common_words = set(original_sentence_lower.split()) & set(sentence.split())
-        common_substrings = ', '.join(sorted(common_words))
-
-        words_to_replace = [word for word in sentence.split() if word not in common_words]
-        if words_to_replace:
-            word_to_mark = random.choice(words_to_replace)
-            sentence = sentence.replace(word_to_mark, colored(word_to_mark, 'red'))
-
-        for word in common_words:
-            sentence = sentence.replace(word, colored(word, 'green'))
-
-        results.append({
-            f"Paraphrased Sentence {idx+1}": sentence,
-            "Common Substrings": common_substrings
-        })
-    return results
-
-# Function for Inverse Transform Sampling
-def inverse_transform_sampling(original_sentence, paraphrased_sentences):
-    stop_words = set(stopwords.words('english'))
-    original_sentence_lower = original_sentence.lower()
-    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
-    paraphrased_sentences_no_stopwords = []
-
-    for sentence in paraphrased_sentences_lower:
-        words = re.findall(r'\b\w+\b', sentence)
-        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
-        paraphrased_sentences_no_stopwords.append(filtered_sentence)
-
-    results = []
-    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
-        common_words = set(original_sentence_lower.split()) & set(sentence.split())
-        common_substrings = ', '.join(sorted(common_words))
-
-        words_to_replace = [word for word in sentence.split() if word not in common_words]
-        if words_to_replace:
-            probabilities = [1 / len(words_to_replace)] * len(words_to_replace)
-            chosen_word = random.choices(words_to_replace, weights=probabilities)[0]
-            sentence = sentence.replace(chosen_word, colored(chosen_word, 'magenta'))
-
-        for word in common_words:
-            sentence = sentence.replace(word, colored(word, 'green'))
-
-        results.append({
-            f"Paraphrased Sentence {idx+1}": sentence,
-            "Common Substrings": common_substrings
-        })
-    return results
-
-# Function for Contextual Sampling
-def contextual_sampling(original_sentence, paraphrased_sentences):
-    stop_words = set(stopwords.words('english'))
-    original_sentence_lower = original_sentence.lower()
-    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
-    paraphrased_sentences_no_stopwords = []
-
-    for sentence in paraphrased_sentences_lower:
-        words = re.findall(r'\b\w+\b', sentence)
-        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
-        paraphrased_sentences_no_stopwords.append(filtered_sentence)
-
-    results = []
-    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
-        common_words = set(original_sentence_lower.split()) & set(sentence.split())
-        common_substrings = ', '.join(sorted(common_words))
-
-        words_to_replace = [word for word in sentence.split() if word not in common_words]
-        if words_to_replace:
-            context = " ".join([word for word in sentence.split() if word not in common_words])
-            chosen_word = random.choice(words_to_replace)
-            sentence = sentence.replace(chosen_word, colored(chosen_word, 'red'))
-
-        for word in common_words:
-            sentence = sentence.replace(word, colored(word, 'green'))
-
-        results.append({
-            f"Paraphrased Sentence {idx+1}": sentence,
-            "Common Substrings": common_substrings
-        })
-    return results
-
-# Function for Exponential Minimum Sampling
-def exponential_minimum_sampling(original_sentence, paraphrased_sentences):
-    stop_words = set(stopwords.words('english'))
-    original_sentence_lower = original_sentence.lower()
-    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
-    paraphrased_sentences_no_stopwords = []
-
-    for sentence in paraphrased_sentences_lower:
-        words = re.findall(r'\b\w+\b', sentence)
-        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
-        paraphrased_sentences_no_stopwords.append(filtered_sentence)
-
-    results = []
-    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
-        common_words = set(original_sentence_lower.split()) & set(sentence.split())
-        common_substrings = ', '.join(sorted(common_words))
-
-        words_to_replace = [word for word in sentence.split() if word not in common_words]
-        if words_to_replace:
-            num_words = len(words_to_replace)
-            probabilities = [2 ** (-i) for i in range(num_words)]
-            chosen_word = random.choices(words_to_replace, weights=probabilities)[0]
-            sentence = sentence.replace(chosen_word, colored(chosen_word, 'red'))
-
-        for word in common_words:
-            sentence = sentence.replace(word, colored(word, 'green'))
-
-        results.append({
-            f"Paraphrased Sentence {idx+1}": sentence,
-            "Common Substrings": common_substrings
-        })
-    return results
-
-
-
-#---------------------------------------------------------------------------
-# aryans implementation please refactor it as you see fit
+# import torch
+# import random
+
+# def sample_word(words, logits, sampling_technique='inverse_transform', temperature=1.0):
+#     if sampling_technique == 'inverse_transform':
+#         probs = torch.softmax(torch.tensor(logits), dim=-1)
+#         cumulative_probs = torch.cumsum(probs, dim=-1)
+#         random_prob = random.random()
+#         sampled_index = torch.where(cumulative_probs >= random_prob)[0][0]
+#     elif sampling_technique == 'exponential_minimum':
+#         probs = torch.softmax(torch.tensor(logits), dim=-1)
+#         exp_probs = torch.exp(-torch.log(probs))
+#         random_probs = torch.rand_like(exp_probs)
+#         sampled_index = torch.argmax(random_probs * exp_probs)
+#     elif sampling_technique == 'temperature':
+#         scaled_logits = torch.tensor(logits) / temperature
+#         probs = torch.softmax(scaled_logits, dim=-1)
+#         sampled_index = torch.multinomial(probs, 1).item()
+#     elif sampling_technique == 'greedy':
+#         sampled_index = torch.argmax(torch.tensor(logits)).item()
+#     else:
+#         raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")
+
+#     sampled_word = words[sampled_index]
+#     return sampled_word
 
 import torch
 import random
 
-def sample_word(words, logits, sampling_technique='inverse_transform', temperature=1.0):
+def sample_word(sentence, words, logits, sampling_technique='inverse_transform', temperature=1.0):
     if sampling_technique == 'inverse_transform':
         probs = torch.softmax(torch.tensor(logits), dim=-1)
         cumulative_probs = torch.cumsum(probs, dim=-1)
@@ -160,4 +48,8 @@ def sample_word(words, logits, sampling_technique='inverse_transform', temperature=1.0):
         raise ValueError("Invalid sampling technique. Choose 'inverse_transform', 'exponential_minimum', 'temperature', or 'greedy'.")
 
     sampled_word = words[sampled_index]
-    return sampled_word
+
+    # Replace [MASK] with the sampled word
+    filled_sentence = sentence.replace('[MASK]', sampled_word)
+
+    return filled_sentence
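A quick usage sketch of the new signature (toy inputs, not from the commit; assumes this commit's sampling_methods.py is importable): sample_word now takes the masked sentence first and returns it with [MASK] filled in, rather than returning the bare sampled word.

from sampling_methods import sample_word

masked = "The cat [MASK] on the mat"
candidate_words = ["sat", "slept", "jumped"]   # e.g. fill-mask token strings
candidate_scores = [0.7, 0.2, 0.1]             # e.g. fill-mask scores, used as logits

# Greedy sampling picks the highest-scoring candidate, so this prints
# "The cat sat on the mat".
print(sample_word(masked, candidate_words, candidate_scores, sampling_technique='greedy'))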
tree.py CHANGED
@@ -1,29 +1,31 @@
-import plotly.graph_objs as go
+import plotly.graph_objects as go
 import textwrap
 import re
 from collections import defaultdict
-from paraphraser import generate_paraphrase
-from masking_methods import mask, mask_non_stopword
-
-def generate_plot(original_sentence, selected_sentences):
-    first_paraphrased_sentence = selected_sentences[0]
-    masked_sentence = mask_non_stopword(first_paraphrased_sentence)
-    masked_versions = mask(masked_sentence)
-
-    nodes = []
-    nodes.append(original_sentence)
-    nodes.extend(selected_sentences)
-    nodes.extend(masked_versions)
-    nodes[0] += ' L0'
-    para_len = len(selected_sentences)
-    for i in range(1, para_len+1):
-        nodes[i] += ' L1'
-    for i in range(para_len+1, len(nodes)):
-        nodes[i] += ' L2'
-
+
+def generate_subplot(paraphrased_sentence, scheme_sentences, sampled_sentence, highlight_info):
+    # Combine nodes into one list with appropriate labels
+    nodes = [paraphrased_sentence] + scheme_sentences + sampled_sentence
+    nodes[0] += ' L0'  # Paraphrased sentence is level 0
+    para_len = len(scheme_sentences)
+    for i in range(1, para_len + 1):
+        nodes[i] += ' L1'  # Scheme sentences are level 1
+    for i in range(para_len + 1, len(nodes)):
+        nodes[i] += ' L2'  # Sampled sentences are level 2
+
+    # Define the highlight_words function
+    def highlight_words(sentence, color_map):
+        for word, color in color_map.items():
+            sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
+        return sentence
+
+    # Clean and wrap nodes, and highlight specified words globally
     cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
-    wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=30)) for node in cleaned_nodes]
-
+    global_color_map = dict(highlight_info)
+    highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
+    wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=30)) for node in highlighted_nodes]
+
+    # Function to determine tree levels and create edges dynamically
     def get_levels_and_edges(nodes):
         levels = {}
         edges = []
@@ -37,58 +39,99 @@ def generate_plot(original_sentence, selected_sentences):
             if level == 1:
                 edges.append((root_node, i))
 
-        # Identify the first L1 node
-        first_l1_node = next(i for i, level in levels.items() if level == 1)
-        # Add edges from the first L1 node to all L2 nodes
-        for i, level in levels.items():
-            if level == 2:
-                edges.append((first_l1_node, i))
+        # Add edges from each L1 node to their corresponding L2 nodes
+        l1_indices = [i for i, level in levels.items() if level == 1]
+        l2_indices = [i for i, level in levels.items() if level == 2]
+
+        for i, l1_node in enumerate(l1_indices):
+            l2_start = i * 4
+            for j in range(4):
+                l2_index = l2_start + j
+                if l2_index < len(l2_indices):
+                    edges.append((l1_node, l2_indices[l2_index]))
+
+        # Add edges from each L2 node to their corresponding L3 nodes
+        l2_indices = [i for i, level in levels.items() if level == 2]
+        l3_indices = [i for i, level in levels.items() if level == 3]
+
+        l2_to_l3_map = {l2_node: [] for l2_node in l2_indices}
+
+        # Map L3 nodes to L2 nodes
+        for l3_node in l3_indices:
+            l2_node = l3_node % len(l2_indices)
+            l2_to_l3_map[l2_indices[l2_node]].append(l3_node)
+
+        for l2_node, l3_nodes in l2_to_l3_map.items():
+            for l3_node in l3_nodes:
+                edges.append((l2_node, l3_node))
 
         return levels, edges
 
     # Get levels and dynamic edges
     levels, edges = get_levels_and_edges(nodes)
-    max_level = max(levels.values())
+    max_level = max(levels.values(), default=0)
 
     # Calculate positions
     positions = {}
-    level_widths = defaultdict(int)
+    level_heights = defaultdict(int)
     for node, level in levels.items():
-        level_widths[level] += 1
+        level_heights[level] += 1
 
-    x_offsets = {level: - (width - 1) / 2 for level, width in level_widths.items()}
-    y_gap = 4
+    y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
+    x_gap = 2
+    l1_y_gap = 10
+    l2_y_gap = 6
 
     for node, level in levels.items():
-        positions[node] = (x_offsets[level], -level * y_gap)
-        x_offsets[level] += 1
+        if level == 1:
+            positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
+        elif level == 2:
+            positions[node] = (-level * x_gap, y_offsets[level] * l2_y_gap)
+        else:
+            positions[node] = (-level * x_gap, y_offsets[level] * l2_y_gap)
+        y_offsets[level] += 1
+
+    # Function to highlight words in a wrapped node string
+    def color_highlighted_words(node, color_map):
+        parts = re.split(r'(\{\{.*?\}\})', node)
+        colored_parts = []
+        for part in parts:
+            match = re.match(r'\{\{(.*?)\}\}', part)
+            if match:
+                word = match.group(1)
+                color = color_map.get(word, 'black')
+                colored_parts.append(f"<span style='color: {color};'>{word}</span>")
+            else:
+                colored_parts.append(part)
+        return ''.join(colored_parts)
 
     # Create figure
     fig = go.Figure()
 
     # Add nodes to the figure
     for i, node in enumerate(wrapped_nodes):
+        colored_node = color_highlighted_words(node, global_color_map)
        x, y = positions[i]
        fig.add_trace(go.Scatter(
-            x=[x],
+            x=[-x],  # Reflect the x coordinate
            y=[y],
            mode='markers',
            marker=dict(size=10, color='blue'),
            hoverinfo='none'
        ))
        fig.add_annotation(
-            x=x,
+            x=-x,  # Reflect the x coordinate
            y=y,
-            text=node,
+            text=colored_node,
            showarrow=False,
-            yshift=20,  # Adjust the y-shift value to avoid overlap
+            xshift=15,
            align="center",
-            font=dict(size=10),
+            font=dict(size=8),
            bordercolor='black',
            borderwidth=1,
-            borderpad=4,
+            borderpad=2,
            bgcolor='white',
-            width=200
+            width=150
        )
 
     # Add edges to the figure
@@ -96,19 +139,19 @@ def generate_plot(original_sentence, selected_sentences):
         x0, y0 = positions[edge[0]]
         x1, y1 = positions[edge[1]]
         fig.add_trace(go.Scatter(
-            x=[x0, x1],
+            x=[-x0, -x1],  # Reflect the x coordinates
             y=[y0, y1],
             mode='lines',
-            line=dict(color='black', width=2)
+            line=dict(color='black', width=1)
         ))
 
     fig.update_layout(
         showlegend=False,
-        margin=dict(t=50, b=50, l=50, r=50),
+        margin=dict(t=20, b=20, l=20, r=20),
         xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
         yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
-        width=1470,
-        height=800  # Increase height to provide more space
+        width=1200,  # Adjusted width to accommodate more levels
+        height=1000  # Adjusted height to accommodate more levels
     )
 
     return fig
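A hedged usage sketch for the new generate_subplot (toy strings, not from the commit; assumes this commit's tree.py imports cleanly): it takes one paraphrased sentence, the masked variants produced by the masking schemes, the sampled fills, and (word, color) pairs for highlighting, and returns a Plotly figure.

from tree import generate_subplot

paraphrased = "The cat sat on the mat"
scheme_sentences = [                  # one per masking scheme, as built in app.py
    "The cat [MASK] on the mat",
    "The [MASK] sat on the mat",
    "The cat sat on the [MASK]",
]
sampled_sentences = [                 # sampled fills for the masked variants
    "The cat sat on the mat", "The cat slept on the mat",
    "The dog sat on the mat", "The cat sat on the rug",
]
highlight_info = [("cat", "red"), ("mat", "blue")]

fig = generate_subplot(paraphrased, scheme_sentences, sampled_sentences, highlight_info)
fig.show()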