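# HF-KeyBERT: a Streamlit front end for KeyBERT keyword/keyphrase extraction.
# Loads a document (custom text or a slice of a Hugging Face dataset) and
# extracts keyphrases with a sentence-transformers model.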
from keybert import KeyBERT
import streamlit as st
import streamlit.components.v1 as components
from datasets import load_dataset
import pandas as pd
st.set_page_config(page_title="KeyBERT")
st.title("HF-KeyBERT: A front end for KeyBERT")
st.caption("By Allen Roush")
st.caption("GitHub: https://github.com/Hellisotherpeople")
st.caption("LinkedIn: https://www.linkedin.com/in/allen-roush-27721011b/")
st.header("KeyBERT")
st.caption("By Maarten Grootendorst")
st.image("https://raw.githubusercontent.com/MaartenGr/KeyBERT/master/images/logo.png", width=200)
st.caption("GitHub: https://github.com/MaartenGr")
st.caption("LinkedIn: https://www.linkedin.com/in/mgrootendorst/")
form = st.sidebar.form("choose_settings")
form.header("Main Settings")
custom_doc = form.checkbox("Use a document from an existing dataset?", value=True)
if custom_doc:
    dataset_name = form.text_area("Enter the name of the Hugging Face dataset to analyze:", value="Hellisotherpeople/DebateSum")
    dataset_name_2 = form.text_area("Enter the name of the dataset config, if it has one", value="")
    split_name = form.text_area("Enter the name of the dataset split that you want to use", value="train")
    number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value=200)
    column_name = form.text_area("Enter the name of the column that we are doing analysis on (the X value)", value="Full-Document")
    index_to_analyze_start = form.number_input("Enter the start index of the documents that you want to analyze from the dataset", value=0)
    index_to_analyze_end = form.number_input("Enter the end index of the documents that you want to analyze from the dataset", value=2)
else:
    doc = st.text_area("Enter a custom document")
model_name = form.text_area("Enter the name of the pre-trained sentence-transformers model to use for featurization", value="all-MiniLM-L6-v2")
form.caption("This will download a new model, so it may take a while, or even break if the model is too large")
form.caption("See the list of pre-trained models that are available here: https://www.sbert.net/docs/pretrained_models.html")
form.form_submit_button("Submit")
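# Helpers: stream a slice of the chosen dataset and build the KeyBERT model.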
def load_and_process_data(path, name, streaming, split_name, number_of_records):
    # Stream the first `number_of_records` rows of the requested split and
    # return the text column chosen in the sidebar (`column_name` is a global).
    # An empty config name from the text box is treated as "no config".
    dataset = load_dataset(path=path, name=name if name else None, streaming=streaming)
    dataset_head = dataset[split_name].take(number_of_records)
    df = pd.DataFrame.from_dict(dataset_head)
    return df[column_name]

def load_model(model_name):
    # Wrap the chosen sentence-transformers model in a KeyBERT instance.
    kw_model = KeyBERT(model=model_name)
    return kw_model
model = load_model(model_name=model_name)

if custom_doc:
    st.header("Original Dataset")
    df = load_and_process_data(dataset_name, dataset_name_2, True, split_name, number_of_records)
    doc = list(df[index_to_analyze_start:index_to_analyze_end])
    st.write(df)
    st.header("Indexed Documents")
    st.write(doc)
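# Sidebar form: KeyBERT extraction settings (ngram range, Max Sum, MMR, diversity, etc.).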
form2 = st.sidebar.form("KeyBERT Settings")
form2.header("KeyBERT Settings")
keyphrase_min = form2.number_input("Keyphrase ngram range minimum", value=1, min_value=1)
keyphrase_max = form2.number_input("Keyphrase ngram range maximum", value=2, min_value=1)
form2.caption("Use the keyphrase min and max to set the length of the resulting keywords/keyphrases")
use_maxsum = form2.checkbox("Use Max Sum Similarity?", value=False)
form2.caption("Max Sum Similarity modifies the keyphrase algorithm in the following way: we take the 2 x top_n words/phrases most similar to the document, then take all top_n combinations of those and keep the combination whose members are least similar to each other by cosine similarity.")
nr_candidates = form2.number_input("Enter the number of candidates to consider if Max Sum Similarity is used", value=10)
form2.caption("Only meaningful if Max Sum Similarity is selected")
use_mmr = form2.checkbox("Use Maximal Marginal Relevance?", value=False)
form2.caption("Maximal Marginal Relevance modifies the keyphrase algorithm in the following way: instead of simply ranking keyphrases by their cosine similarity to the document, keyphrases are also ranked against the keyphrases that have already been selected")
diversity = form2.number_input("Enter the diversity", value=0.7)
form2.caption("Diversity is only meaningful if Maximal Marginal Relevance is turned on. It controls how strongly the MMR algorithm penalizes similarity to already selected keyphrases")
top_n = form2.number_input("Enter the number of returned keyphrases", value=10)
min_df = form2.number_input("Enter the minimum document frequency of a word", value=1, max_value=len(doc))
form2.caption("Only meaningful if extracting the keyphrases of multiple documents")
seed_keywords = form2.text_area("Enter a list of keywords (separated by spaces) which will personalize/guide the extracted keywords", value="")
form2.caption("Due to how this is implemented in KeyBERT, seed keywords usually don't heavily impact the results")
form2.form_submit_button("Submit")
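# Run extraction. nr_candidates only takes effect when use_maxsum is True, and
# diversity only takes effect when use_mmr is True; KeyBERT ignores them otherwise.
# An empty seed-keyword box is passed as None so KeyBERT skips seed guidance.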
keywords = model.extract_keywords(doc, keyphrase_ngram_range=(keyphrase_min, keyphrase_max), use_maxsum=use_maxsum, use_mmr=use_mmr, diversity=diversity, top_n=top_n, min_df=min_df, nr_candidates=nr_candidates, seed_keywords=seed_keywords.split() if seed_keywords else None)
st.header("Extracted Keywords/Keyphrases") | |
st.caption("Output is sorted in reverse order (so the final element is the strongest keyphrase and the first element is the nth strongest)") | |
st.caption("That means you should read from the bottom up") | |
st.write(keywords) | |
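
# Minimal standalone sketch of the same call outside Streamlit (illustrative only;
# the example document below is made up and not part of the app):
#
#     from keybert import KeyBERT
#     kw_model = KeyBERT(model="all-MiniLM-L6-v2")
#     kw_model.extract_keywords(
#         "Streamlit lets you build data apps in pure Python.",
#         keyphrase_ngram_range=(1, 2),
#         use_mmr=True,
#         diversity=0.7,
#         top_n=5,
#     )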