Spaces:

sasha
/

evaluation-buddy

Sleeping

Sasha

adding some hacky task type detection

4474a2c over 2 years ago

6.5 kB

	import streamlit as st
	from datasets import load_dataset_builder
	from datasets import get_dataset_config_names
	from os import listdir
	from datasets import load_dataset, Dataset
	from datasets_sql import query
	import plotly.express as px
	import numpy as np
	import statistics

	# Browser-tab title, icon and page width for the whole app.
	page_settings = {
	    "page_title": "Evaluation Buddy",
	    "page_icon": "./robot.png",
	    "layout": "wide",
	}
	st.set_page_config(**page_settings)

	# Main heading rendered at the top of the page.
	st.title("Hugging Face Evaluation Buddy")

	# Datasets offered in the sidebar picker (sorted at display time).
	top_datasets = [
	    'glue', 'super_glue', 'wikitext', 'imdb', 'squad', 'squad_es',
	    'paws', 'librispeech_asr', 'wmt16', 'xnli', 'snli', 'ag_news',
	    'anli', 'amazon_polarity', 'squad_v2', 'conll2003', 'red_caps',
	    'common_voice', 'stsb_multi_mt', 'trec', 'tweet_eval', 'cosmos_qa',
	    'sick', 'xsum', 'wikiann', 'yelp_polarity', 'hellaswag', 'piqa',
	    'race', 'winogrande',
	]

	# Task phrases searched for in a dataset description when no task is declared.
	tasks = [
	    'text classification',
	    'question answering',
	    'automatic speech recognition',
	    'natural language inference',
	    'machine translation',
	    'sentiment analysis',
	    'text simplification',
	    'named entity recognition',
	    'reading comprehension',
	]

	# Known metric names; a dataset whose name appears here has a dedicated metric.
	metrics = [
	    'matthews_correlation', 'perplexity', 'meteor', 'code_eval',
	    'super_glue', 'rouge', 'mauve', 'cer', 'accuracy', 'recall',
	    'bleurt', 'sari', 'precision', 'mean_iou', 'squad', 'mahalanobis',
	    'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split',
	    'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet',
	    'gleu', 'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue',
	    'mse', 'xnli', 'ter', 'coval', 'wer', 'bleu', 'glue', 'sacrebleu',
	]

	# Sidebar controls: pick a dataset, then one of its configurations, then a
	# split, and finally the threshold used later to judge label balance.
	# NOTE(review): the original indentation was lost; this assumes every
	# statement down to the slider lives inside the expander - confirm.
	with st.sidebar.expander("Datasets", expanded=True):
	    dataset_name = st.selectbox(
	        "Choose a dataset to evaluate on:",
	        options=sorted(top_datasets),
	    )

	    # Configurations depend on the dataset that was just chosen.
	    configs = get_dataset_config_names(dataset_name)
	    dataset_config = st.selectbox(
	        "Choose a configuration of your dataset:",
	        options=configs,
	    )

	    # The builder exposes the dataset's metadata (splits, features, description).
	    dataset_builder = load_dataset_builder(dataset_name, dataset_config)
	    splits = list(dataset_builder.info.splits)
	    dataset_split = st.selectbox(
	        "Choose a dataset split:",
	        options=splits,
	    )

	    balanced_stdev = st.slider(
	        "Choose a standard deviation threshold for determining whether a dataset is balanced or not:",
	        min_value=0.00,
	        max_value=1.00,
	        value=0.20,
	    )



	# Overview section: heading, the description stored in the dataset's
	# metadata, and a link to the dataset page on the Hub.
	st.markdown("## Here is some information about your dataset:")
	st.markdown("### Description")

	dataset_description = dataset_builder.info.description
	st.markdown(dataset_description)

	dataset_url = f"https://huggingface.co/datasets/{dataset_name}"
	st.markdown(f"For more information about this dataset, check out [its website]({dataset_url})")


	st.markdown("### Dataset-Specific Metrics")

	# A dataset has a "dedicated" metric when a metric with the same name exists.
	dedicated_metric = dataset_name in metrics
	if dedicated_metric:
	    st.markdown("Great news! Your dataset has a dedicated metric for it! You can use it like this:")
	    # repr() wraps each name in quotes so the displayed snippet is valid,
	    # copy-pasteable Python (previously the names were emitted bare).
	    metric_args = repr(dataset_name)
	    if dataset_config is not None:
	        metric_args += ", " + repr(dataset_config)
	    code = "from datasets import load_metric\n" + f"metric = load_metric({metric_args})"
	    st.code(code, language='python')
	else:
	    st.markdown("Your dataset doesn't have a dedicated metric, but that's ok!")

	st.markdown("### Task-Specific Metrics")

	# Prefer the task declared in the dataset metadata. getattr() with a default
	# covers builders whose info object has no `task_templates` attribute, and
	# the truthiness test covers both None and an empty list.
	task_templates = getattr(dataset_builder.info, 'task_templates', None)
	if task_templates:
	    task = task_templates[0].task
	else:
	    # Fallback heuristic: take the FIRST known task phrase mentioned in the
	    # description. (The previous loop overwrote its result on every
	    # iteration, so an earlier match was always lost.)
	    description_text = str(dataset_builder.info.description).lower()
	    task = next((t for t in tasks if t in description_text), None)

	# Declared task ids are hyphenated while the phrases in `tasks` use spaces;
	# compare on one normalized form so both detection paths reach the same branch.
	task_label = task.replace('-', ' ') if task is not None else None

	if task_label is not None:
	    st.markdown("The task associated to it your dataset is: " + task_label)

	if task_label == 'automatic speech recognition':
	    st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
	    st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
	    st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
	else:
	    # Shown both when no task was detected and when the detected task has no
	    # dedicated metrics listed here.
	    st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")


	#print(dataset_builder.info.task_templates)
	#print(dataset_builder.info.features)


	#st.markdown("### General Metrics")



	#dataset = load_dataset(dataset_name, dataset_config, dataset_split)
	#print(dataset_name, dataset_config, dataset_split)

	#print(labels.head())



	# Local import keeps this section self-contained; Counter tallies the label column.
	from collections import Counter

	# Decide up front whether the dataset is labelled instead of relying on a
	# catch-all `except`: only a class-label style feature exposes `names`.
	label_feature = (dataset_builder.info.features or {}).get('label')
	class_names = getattr(label_feature, 'names', None)

	if class_names:
	    st.markdown("### Labelled Metrics")
	    st.markdown("Your dataset has "+ str(len(class_names)) + " labels : " + ', '.join(class_names))

	    try:
	        # Load the configuration chosen in the sidebar, not the default one.
	        dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
	        label_tally = Counter(dataset['label'])
	    except Exception as err:
	        # Best effort: report the problem instead of silently claiming the
	        # dataset has no labels.
	        st.warning(f"Could not load {dataset_name} ({dataset_split}) to compute its label distribution: {err}")
	        label_tally = Counter()

	    # Count per declared class id so each count is paired with the right
	    # name; ids outside the declared range (e.g. a negative placeholder for
	    # unlabelled rows) are ignored and absent classes count as zero.
	    counts = [label_tally.get(class_id, 0) for class_id in range(len(class_names))]
	    total = sum(counts)

	    if total > 0:
	        st.plotly_chart(px.pie(values=counts, names=list(class_names), width=800, height=400))
	        proportion = [c / total for c in counts]
	        # statistics.stdev needs at least two values; a single class has no spread.
	        stdev_dataset = statistics.stdev(proportion) if len(proportion) > 1 else 0.0

	        if stdev_dataset <= balanced_stdev:
	            st.markdown("Since your dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
	            st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
	            accuracy_code = 'from datasets import load_metric\nmetric = load_metric("accuracy")'
	            st.code(accuracy_code, language='python')
	        else:
	            st.markdown("Since your dataset is not well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
	            st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
	            # The snippet must load the metric being recommended (it used to show "accuracy").
	            f1_code = 'from datasets import load_metric\nmetric = load_metric("f1")'
	            st.code(f1_code, language='python')
	            st.markdown('Since it takes into account both precision and recall, which works well to evaluate model performance on minority classes.')
	else:
	    st.markdown("### Unsupervised Metrics")
	    st.markdown("Since dataset doesn't have any labels, so the metrics that you can use for evaluation are:")
	    st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
	    perplexity_code = 'from datasets import load_metric\nmetric = load_metric("perplexity")'
	    st.code(perplexity_code, language='python')
	    st.markdown('If you choose a model that was trained on ' + dataset_name + ' and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')