the-stack-inspection

Sleeping

App Files Files Community

the-stack-inspection / app.py

harmdevries

Update app.py

a383930 almost 2 years ago

raw

history blame

3.27 kB

	import streamlit as st
	import json
	import pandas as pd
	from datasets import load_dataset

	# Page chrome: wide layout, with the app title shown in the sidebar.
	st.set_page_config(page_title="The Stack data Inspection", layout="wide")
	st.sidebar.title("The Stack data Inspection")

	# Table read from disk; only its "language" and "extension" columns are used here.
	df = pd.read_csv("extension_distribution.csv")
	all_extensions = df["extension"].tolist()

	# Map each language to the list of its extensions. Languages keep the order in
	# which they first appear in the CSV, and extensions keep their row order.
	tags = {}
	for _, record in df.iterrows():
	    tags.setdefault(record["language"], []).append(record["extension"])
	all_languages = list(tags)



	# `st.cache` is deprecated in recent Streamlit releases. Use its replacement
	# when the installed version provides it; otherwise fall back to the legacy
	# decorator so older pinned environments keep working. The `or` short-circuits,
	# so `st.cache` is only looked up when `cache_resource` is missing.
	_cache_dataset = getattr(st, "cache_resource", None) or st.cache


	@_cache_dataset
	def load_data(language, ext):
	    """Load the inspection data for one language/extension pair.

	    The result is cached per ``(language, ext)`` so that Streamlit reruns
	    triggered by widget changes do not reload the same data.

	    Args:
	        language: Directory name of the language under ``data/`` in the
	            ``loubnabnl/the-stack-inspection-data`` dataset repository.
	        ext: Directory name of the extension under ``data/{language}/``.

	    Returns:
	        The ``train`` split returned by ``datasets.load_dataset``.
	    """
	    return load_dataset(
	        "loubnabnl/the-stack-inspection-data",
	        data_dir=f"data/{language}/{ext}",
	        split="train",
	    )


	# Language / extension pickers.
	# NOTE(review): both widgets are created through `st.sidebar`, so the column
	# contexts below presumably do not affect where they are drawn -- confirm
	# before removing the columns.
	language_slot, extension_slot, _ = st.columns([1, 1, 4])
	with language_slot:
	    chosen_language = st.sidebar.selectbox(
	        label="Select a programming language",
	        options=all_languages,
	        index=0,
	    )
	with extension_slot:
	    # Only the extensions recorded for the selected language are offered.
	    extensions_for_language = tags[chosen_language]
	    chosen_ext = st.sidebar.selectbox(
	        label="Select an extension",
	        options=extensions_for_language,
	        index=0,
	    )

	# Optional toggles; each one narrows the displayed files further down.
	st.sidebar.header("Filters")
	not_lexable = st.sidebar.checkbox("Not lexable?")
	low_alphanum = st.sidebar.checkbox("Low alphanum count?")
	long_lines = st.sidebar.checkbox("Long lines?")


	# Fetch the files for the current selection, then narrow them down with
	# whichever sidebar filters are ticked.
	samples = load_data(chosen_language, chosen_ext)

	# (toggle, row predicate) pairs, applied in this order.
	row_filters = [
	    (not_lexable, lambda x: not x["lexable"]),
	    (low_alphanum, lambda x: x["low_alphanum"]),
	    (long_lines, lambda x: x["long_lines"]),
	]
	for enabled, keep_row in row_filters:
	    if enabled:
	        samples = samples.filter(keep_row)

	# Number of files left after filtering; also used to number the rows.
	max_docs = len(samples)
	samples = samples.add_column("idx", range(max_docs))

	# info about extension
	# st.sidebar.markdown("### Information about the extension:")
	# text = f"Extension {chosen_ext} has {max_docs} files, {df[df['extension'] == chosen_ext]['low_alphanum_count'].values[0]} with very low alphanumeric ratio, \
	# {df[df['extension'] == chosen_ext]['long_lines_count'].values[0]} with very long lines, and {df[df['extension'] == chosen_ext]['non_lexable_count'].values[0]} \
	# are not lexable.\n These files are at indexes:\n {indexes_not_lexed}."
	# st.sidebar.markdown(text)

	# Nothing is rendered below when the current filters leave no files.
	if max_docs > 0:
	    # Two equal-width columns; the picker sits in the first, the second is unused.
	    picker_col, _ = st.columns([3, 3])
	    with picker_col:
	        index_example = st.number_input(
	            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
	            min_value=0,
	            max_value=max_docs - 1,
	            value=0,
	            step=1,
	        )

	    # Row of the (possibly filtered) dataset selected by the picker.
	    example = samples[index_example]

	    # Disabled: per-example summary of the quality flags.
	    # NOTE(review): the two labels below are crossed (`text_alpha` reads
	    # "long_lines", `text_lines` reads "low_alphanum"); fix before re-enabling.
	    # st.markdown("#### Information about the chosen example:")
	    # text_alpha = "has" if example["long_lines"] else "doesn't have"
	    # text_lines = "has" if example["low_alphanum"] else "doesn't have"
	    # text_lexer = "is" if example["lexable"] else "isn't"
	    # st.markdown(
	    #     f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
	    #     {text_lines} very long lines, and {text_lexer} lexable."
	    # )

	    # Show the file: highlighted when it is lexable, as plain text otherwise.
	    st.markdown("#### File content:")
	    if example["lexable"]:
	        st.code(example["content"], language=chosen_language)
	    else:
	        st.write("File can't be lexed so we remove syntax highlighting.\nContent:\n")
	        st.text(example["content"])