Spaces:
Running
Running
import re | |
from nltk.corpus import stopwords | |
def find_common_subsequences(sentence, str_list, max_n=5):
    """Find word n-grams of *sentence* that occur in every string of *str_list*.

    All inputs are normalised first: lowercased, stripped of every character
    that is neither a word character nor whitespace, and filtered against
    NLTK's English stop-word list. N-grams of the normalised sentence are then
    tried from longest to shortest; an n-gram is kept when it appears as a
    contiguous run of whole words in every normalised string and is not
    already covered, on word boundaries, by a previously kept n-gram.

    Args:
        sentence: The reference sentence whose n-grams are searched for.
        str_list: Strings that must all contain an n-gram for it to be kept.
            With an empty list every n-gram trivially qualifies.
        max_n: Longest n-gram length (in words) to consider. Defaults to 5.

    Returns:
        A list of ``(index, phrase)`` tuples ordered by the phrase's first
        position in the normalised sentence, with ``index`` starting at 1.
    """
    stop_words = set(stopwords.words('english'))

    def normalize(text):
        # Lowercase, drop punctuation/special characters, remove stop words
        # and collapse all whitespace to single spaces.
        text = re.sub(r'[^\w\s]', '', text.lower())
        return " ".join(word for word in text.split() if word not in stop_words)

    def contains_phrase(text, phrase):
        # Both arguments are single-space-joined word sequences, so padding
        # with spaces turns a substring test into a whole-word match.
        return f" {phrase} " in f" {text} "

    words = normalize(sentence).split()
    candidates = [normalize(s) for s in str_list]

    common_grams = []
    added_phrases = []
    # Longest n-grams first, so shorter ones nested inside them are skipped.
    for n in range(min(max_n, len(words)), 0, -1):
        for i in range(len(words) - n + 1):
            subseq = " ".join(words[i:i + n])
            # Compare on word boundaries: a plain ``subseq in phrase`` would
            # wrongly discard e.g. "art" because "start" contains it.
            if any(contains_phrase(phrase, subseq) for phrase in added_phrases):
                continue
            if all(contains_phrase(text, subseq) for text in candidates):
                common_grams.append((i, subseq))
                added_phrases.append(subseq)

    # Order by first appearance in the sentence, then number from 1.
    common_grams.sort(key=lambda item: item[0])
    return [(rank, subseq) for rank, (_, subseq) in enumerate(common_grams, start=1)]
# Example usage (commented out; uncomment the lines below to run):
# sentence = "Kim Beom-su, the billionaire behind the South Korean technology giant Kakao, was taken into custody on allegations of stock manipulation during a bidding war over one of the country’s largest K-pop agencies." | |
# str_list = ["The founder of South Korean technology company Kakao, billionaire Kim Beom-su, was arrested on charges of stock fraud during a bidding war for one of North Korea's biggest K-pop companies.", "In a bidding war for one of South Korea's largest K-pop agencies, Kim Beom-su, the billionaire who owns Kakao, was arrested on charges of manipulating stocks.", "During a bidding war for one of South Korea's biggest K-pop agencies, Kim Beom-su, the billionaire who owns Kakao, was arrested on charges of manipulating stocks.", "Kim Beom-su, the founder of South Korean technology giant Kakao's billionaire investor status, was arrested on charges of stock fraud during a bidding war for one of North Korea'S top K-pop agencies.", "A bidding war over one of South Korea's biggest K-pop agencies led to the arrest and apprehension charges of Kim Beom-Su, the billionaire who owns the technology giant Kakao.", "The billionaire who owns South Korean technology giant Kakao, Kim Beom-Su, was taken into custody for allegedly engaging in stock trading during a bidding war for one of North Korea's biggest K-pop media groups.", "Accused of stockpiling during a bidding war for one of South Korea's biggest K-pop agencies, Kim Beom-Su, the founder and owner of technology firm known as Kakao, was arrested on charges of manipulating stocks.", 'Kakao, the South Korean technology giant, was involved in a bidding war with Kim Beon-su, its founder, who was arrested on charges of manipulating stocks.', "South Korea's Kakao corporation'entrepreneur husband, Kim Beom-su (pictured), was arrested on suspicion of stock fraud during a bidding war for one of the country'S top K-pop companies.", 'Kim Beom-su, the billionaire who own a South Korean technology company called Kakaof, was arrested on charges of manipulating stocks in an ongoing bidding war over one million shares.'] | |
# print(find_common_subsequences(sentence, str_list)) | |