loubnabnl's picture
loubnabnl HF staff
update filtering
353f3d1
raw
history blame
2.59 kB
import streamlit as st
import json
import pandas as pd
from datasets import load_dataset
# Page setup: wide layout, with the app title shown in the sidebar.
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.sidebar.title("The Stack data Inspection")

# The CSV provides (at least) the "language" and "extension" columns read below.
df = pd.read_csv("new_extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Map each language to the list of its extensions, in CSV row order.
tags = {}
for _, record in df.iterrows():
    tags.setdefault(record["language"], []).append(record["extension"])
all_languages = list(tags)
@st.cache_data()
def load_data(language, ext):
    """Load the "train" split for one language/extension pair.

    Reads the ``data/{language}/{ext}`` directory of the
    ``loubnabnl/the-stack-inspection-data`` dataset through ``load_dataset``.
    The function is wrapped in ``st.cache_data``.
    """
    data_dir = f"data/{language}/{ext}"
    return load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=data_dir,
        split="train",
    )
# Language / extension pickers. They are created through `st.sidebar`, so they
# always render in the sidebar; the main-area `st.columns` contexts that used
# to wrap them had no effect on placement and only left an empty row behind.
chosen_language = st.sidebar.selectbox(
    label="Select a programming language", options=all_languages, index=0
)
# The extension choices depend on the language picked above.
chosen_ext = st.sidebar.selectbox(
    label="Select an extension", options=tags[chosen_language], index=0
)

# Filter controls; the values are read by the dataset filtering step below.
st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox("Not lexable")
min_alphanum = st.sidebar.slider("Minimum alphanumeric fraction", 0.0, 1.0, 1.0)
max_line_length = st.sidebar.slider("Maximum line length", 0, 1000, 0)
max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500, 0)
# Implicit string concatenation instead of a backslash continuation inside the
# literal, so the two sentences are separated by exactly one space.
st.sidebar.markdown(
    "Printed files have `max_line_length` and `average_line_length` larger "
    "than the selected values. "
    "`alphanumeric_fraction` is smaller than the selected value."
)
# Load the samples for the selected language/extension and keep only the files
# selected by the sidebar filters.
samples = load_data(chosen_language, chosen_ext)


def _matches_filters(sample):
    """Return True when `sample` satisfies every active sidebar filter.

    A file is kept when its alphanumeric fraction is below the chosen value
    and both its maximum and average line lengths are above the chosen values.
    When "Not lexable" is ticked, lexable files are dropped as well.
    """
    if sample["alphanum_fraction"] >= min_alphanum:
        return False
    if sample["max_line_length"] <= max_line_length:
        return False
    if sample["avg_line_length"] <= max_mean_line_length:
        return False
    return not (not_lexable and sample["lexable"])


# One pass over the dataset instead of one pass per criterion; the surviving
# rows are the same as with the chained filters.
samples = samples.filter(_matches_filters)
max_docs = len(samples)
# Show one of the remaining files, or a notice when the filters left nothing.
if max_docs == 0:
    st.text("The dataset is empty after the filtering!")
else:
    # Index picker in the left half of the page; the right half stays empty.
    picker_col, _ = st.columns([3, 3])
    with picker_col:
        index_example = st.number_input(
            label=f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            min_value=0,
            max_value=max_docs - 1,
            value=0,
            step=1,
        )

    example = samples[index_example]
    st.markdown("#### File content:")
    if not example["lexable"]:
        # Non-lexable files are shown as plain text, without highlighting.
        st.text("File can't be lexed so we remove syntax highlighting.\nContent:\n")
        st.text(str(example["content"]))
    else:
        st.code(example["content"], language=chosen_language)