# HF-KeyBERT / app.py
from keybert import KeyBERT
import streamlit as st
import streamlit.components.v1 as components
from datasets import load_dataset
import pandas as pd
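# App flow: pick a Hugging Face dataset (or paste a custom document), choose a
# sentence-transformers embedding model, tune KeyBERT's settings in the sidebar,
# and view the extracted keywords/keyphrases.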
st.set_page_config(page_title="KeyBERT")
st.title("HF-KeyBERT: A front end for KeyBERT")
st.caption("By Allen Roush")
st.caption("github: https://github.com/Hellisotherpeople")
st.caption("Linkedin: https://www.linkedin.com/in/allen-roush-27721011b/")
st.header("KeyBERT")
st.caption("By Maarten Grootendorst")
st.image("https://raw.githubusercontent.com/MaartenGr/KeyBERT/master/images/logo.png", width=200)
st.caption("github: https://github.com/MaartenGr")
st.caption("Linkedin: https://www.linkedin.com/in/mgrootendorst/")
form = st.sidebar.form("choose_settings")
form.header("Main Settings")
custom_doc = form.checkbox("Use a document from an existing dataset?", value=True)
if custom_doc:
    dataset_name = form.text_area("Enter the name of the Hugging Face dataset to analyze:", value="Hellisotherpeople/DebateSum")
    dataset_name_2 = form.text_area("Enter the name of the dataset config, if it has one", value="")
    split_name = form.text_area("Enter the name of the dataset split that you want to use", value="train")
    number_of_records = form.number_input("Enter the number of documents to load from the dataset", value=200)
    column_name = form.text_area("Enter the name of the column to analyze (the X value)", value="Full-Document")
    index_to_analyze_start = form.number_input("Enter the start index of the documents to analyze", value=0)
    index_to_analyze_end = form.number_input("Enter the end index of the documents to analyze", value=2)
else:
    doc = st.text_area("Enter a custom document")
model_name = form.text_area("Enter the name of the pre-trained sentence-transformers model to use for featurization", value="all-MiniLM-L6-v2")
form.caption("This will download a new model, so it may take a while, or even break if the model is too large")
form.caption("See the list of available pre-trained models here! https://www.sbert.net/docs/pretrained_models.html")
form.form_submit_button("Submit")
@st.cache
def load_and_process_data(path, name, streaming, split_name, number_of_records, column_name):
    # Stream the dataset so we only download the records we actually need
    dataset = load_dataset(path=path, name=name, streaming=streaming)
    dataset_head = dataset[split_name].take(number_of_records)
    df = pd.DataFrame.from_dict(dataset_head)
    # Return only the column being analyzed; column_name is a parameter (rather
    # than a global) so st.cache invalidates correctly when the user changes it
    return df[column_name]
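# Usage sketch (hypothetical values): the cached loader returns a pandas Series of
# raw document strings, e.g.
# docs = load_and_process_data("Hellisotherpeople/DebateSum", None, True, "train", 200, "Full-Document")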
@st.cache(allow_output_mutation=True)
def load_model(model_name):
    kw_model = KeyBERT(model=model_name)
    return kw_model
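# allow_output_mutation tells st.cache not to hash the returned KeyBERT object on
# every rerun; the model wrapper is large and not reliably hashable, so the cached
# instance is simply reused for a given model_name.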
model = load_model(model_name=model_name)
if custom_doc:
    st.header("Original Dataset")
    # An empty config name means "no config": pass None so load_dataset doesn't
    # look up a config literally named ""
    df = load_and_process_data(dataset_name, dataset_name_2 if dataset_name_2 else None, True, split_name, number_of_records, column_name)
    doc = list(df[index_to_analyze_start:index_to_analyze_end])
    st.write(df)
    st.header("Indexed Documents")
    st.write(doc)
form2 = st.sidebar.form("KeyBERT Settings")
form2.header("KeyBERT Settings")
keyphrase_min = form2.number_input("KeyPhrase ngram range minimum", value=1, min_value=1)
keyphrase_max = form2.number_input("KeyPhrase ngram range maximum", value=2, min_value=1)
form2.caption("Use the keyphrase min and max to set the length of the resulting keywords/keyphrases")
use_maxsum = form2.checkbox("Use Max Sum Similarity?", value=False)
form2.caption("Max sum modifies the keyphrase algorithm in the following way: we take the 2 x top_n most similar words/phrases to the document. Then, we take all top_n combinations from the 2 x top_n words and extract the combination that are the least similar to each other by cosine similarity.")
nr_candidates = form2.number_input("Enter the number of candidates to consider if Max Sum is used", value=10)
form2.caption("Only meaningful if Max Sum Similarity is selected")
use_mmr = form2.checkbox("Use Maximal Marginal Relevance?", value=False)
form2.caption("Maximal Marginal Relevance modifies the keyphrase algorithm in the following way: instead of simply ranking keyphrases by their cosine similarity to the document, each keyphrase is also ranked against the keyphrases that have already been selected")
diversity = form2.number_input("Enter the diversity", value=0.7, min_value=0.0, max_value=1.0)
form2.caption("Diversity is only meaningful if Maximal Marginal Relevance is turned on. It controls how strongly MMR penalizes similarity to already-selected keyphrases")
top_n = form2.number_input("Enter the number of returned keyphrases", value=10)
# For a custom document, doc is a single string, so cap min_df at 1 instead of the
# string's character count
min_df = form2.number_input("Enter the minimum document frequency of a word", value=1, max_value=len(doc) if isinstance(doc, list) else 1)
form2.caption("Only meaningful if extracting the keyphrases of multiple documents")
seed_keywords = form2.text_area("Enter a list of keywords (separated by spaces) which will personalize/guide the extracted keywords", value="")
form2.caption("Due to how this is implemented in KeyBERT, it usually has only a modest impact on the results")
form2.form_submit_button("Submit")
# Pass None (not an empty list) when no seed keywords were entered
keywords = model.extract_keywords(doc, keyphrase_ngram_range=(keyphrase_min, keyphrase_max), use_maxsum=use_maxsum, use_mmr=use_mmr, diversity=diversity, top_n=top_n, min_df=min_df, nr_candidates=nr_candidates, seed_keywords=seed_keywords.split() if seed_keywords else None)
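# For a single string, extract_keywords returns a list of (keyphrase, score) tuples,
# e.g. [("deep learning", 0.42), ...] (scores illustrative, not real output); for a
# list of documents (the dataset path above), it returns one such list per document.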
st.header("Extracted Keywords/Keyphrases")
st.caption("Output is sorted in reverse order (so the final element is the strongest keyphrase and the first element is the nth strongest)")
st.caption("That means you should read from the bottom up")
st.write(keywords)