import re

try:
    from nltk.corpus import stopwords
except ImportError:
    # NLTK is only needed when the caller does not pass ``stop_words``;
    # the failure is deferred to call time so the module stays importable.
    stopwords = None


def _tokenize(text, stop_set):
    """Lowercase *text*, drop punctuation, and return its non-stop-word tokens."""
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    return [token for token in cleaned.split() if token not in stop_set]


def _has_phrase(haystack, phrase):
    """Return True if *phrase* occurs in *haystack* on whole-word boundaries.

    Both arguments are tokens joined by single spaces, so padding each side
    with a space turns a substring test into a whole-word test.
    """
    return f" {phrase} " in f" {haystack} "


def find_common_subsequences(sentence, str_list, *, max_n=5, stop_words=None):
    """Find word n-grams of *sentence* that occur in every string of *str_list*.

    All text is lowercased, stripped of characters that are neither word
    characters nor whitespace, and filtered of stop words before comparison.
    Longer n-grams are tried first; a shorter n-gram is skipped when it is
    already contained (as whole words) in an accepted longer one, which also
    removes repeated occurrences of the same phrase.

    Args:
        sentence: The reference sentence whose n-grams are searched for.
        str_list: Iterable of strings that must all contain an n-gram for it
            to be reported. If it is empty, every n-gram trivially qualifies.
        max_n: Largest n-gram length (in words) to consider. Defaults to 5.
        stop_words: Optional iterable of words to ignore. When ``None`` the
            NLTK English stop-word list is used.

    Returns:
        A list of ``(rank, phrase)`` tuples, where ``rank`` starts at 1 and
        follows the order in which the phrases first appear in the cleaned
        sentence.

    Raises:
        ImportError: If ``stop_words`` is ``None`` and NLTK is not installed.
    """
    if stop_words is None:
        if stopwords is None:
            raise ImportError(
                "nltk is required when 'stop_words' is not provided"
            )
        stop_words = stopwords.words('english')
    stop_set = {word.lower() for word in stop_words}

    words = _tokenize(sentence, stop_set)
    candidates = [" ".join(_tokenize(s, stop_set)) for s in str_list]

    found = []  # (position in cleaned sentence, phrase)
    for n in range(max_n, 0, -1):
        for start in range(len(words) - n + 1):
            phrase = " ".join(words[start:start + n])
            # Cheap check first: skip phrases already covered by a longer
            # (or identical) accepted phrase. Whole-word matching prevents
            # e.g. "art" from being treated as part of "smart car".
            if any(_has_phrase(longer, phrase) for _, longer in found):
                continue
            if all(_has_phrase(candidate, phrase) for candidate in candidates):
                found.append((start, phrase))

    # Report phrases in the order they first appear in the sentence.
    found.sort(key=lambda item: item[0])
    return [(rank, phrase) for rank, (_, phrase) in enumerate(found, start=1)]


# Example usage
# sentence = (
#     "Donald Trump said at a campaign rally event in Wilkes-Barre, "
#     "Pennsylvania, that there has “never been a more dangerous time "
#     "since the Holocaust” to be Jewish in the United States."
# )
# str_list = ['']
# print(find_common_subsequences(sentence, str_list))