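# Hugging Face Evaluation Buddy
# A Streamlit app that suggests evaluation metrics for a chosen Hub dataset,
# based on dedicated metrics, the dataset's task, and its label balance.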
import statistics

import plotly.express as px
import streamlit as st
from datasets import get_dataset_config_names, load_dataset, load_dataset_builder
from datasets_sql import query
st.set_page_config(
page_title="HuggingFace Evaluation Buddy",
page_icon="./robot.png",
layout="wide",
)
st.title("Hugging Face Evaluation Buddy")
st.markdown('### Here to help you do mindful model evaluation')
top_datasets= ['glue', 'super_glue', 'wikitext', 'imdb', 'squad', 'squad_es', \
'paws', 'librispeech_asr', 'wmt16', 'xnli', 'snli', 'ag_news', \
'anli', 'amazon_polarity', 'squad_v2', 'conll2003', 'red_caps', \
'common_voice', 'stsb_multi_mt', 'trec', 'tweet_eval', 'cosmos_qa',\
'sick', 'xsum', 'wikiann', 'yelp_polarity', 'hellaswag', 'piqa', \
'race', 'winogrande']
tasks= ['classification', 'question answering', 'automatic speech recognition', 'natural language inference', \
'translation', 'sentiment analysis', 'text simplification', 'named entity recognition', \
'reading comprehension', 'paraphrase identification', 'natural language understanding',\
'textual entailment', 'commonsense reasoning', 'summarization']
metrics = ['matthews_correlation', 'perplexity', 'meteor', 'code_eval', 'super_glue', 'rouge', \
           'mauve', 'cer', 'accuracy', 'recall', 'bleurt', 'sari', 'precision', 'mean_iou', \
           'squad', 'mahalanobis', 'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split', \
           'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet', 'gleu', \
           'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue', 'mse', 'xnli', 'ter', \
           'coval', 'wer', 'bleu', 'glue', 'sacrebleu']
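# Set to True below once a dedicated (dataset-specific) metric is found.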
dedicated_metric = False
def find_task(dname):
    """Infer the task associated with a dataset, either from its task template
    or by searching its description for a known task name."""
    task = None
    dataset_builder = load_dataset_builder(dname, dataset_config)
    try:
        task = dataset_builder.info.task_templates[0].task
    except (TypeError, IndexError, AttributeError):
        for t in tasks:
            if t in str(dataset_builder.info.description).lower():
                task = t
    return task
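# Sidebar controls: dataset, configuration, split, and the standard-deviation
# threshold used to decide whether the label distribution counts as balanced.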
with st.sidebar.expander("Datasets", expanded=True):
    dataset_name = st.selectbox(
        "Choose a dataset to evaluate on:",
        sorted(top_datasets))
    configs = get_dataset_config_names(dataset_name)
    dataset_config = st.selectbox(
        "Choose a configuration of the dataset:",
        configs)
    dataset_builder = load_dataset_builder(dataset_name, dataset_config)
    splits = [s for s in dataset_builder.info.splits]
    dataset_split = st.selectbox(
        "Choose a dataset split:",
        splits)
    balanced_stdev = st.slider("Choose a standard deviation threshold for determining whether a dataset is balanced or not:", 0.00, 1.00, 0.20)
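# Show basic information about the selected dataset.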
st.markdown("## Here is some information about this dataset:")
st.markdown(dataset_builder.info.description)
if len(dataset_builder.info.description) == 1:
st.markdown("This dataset does not have a description. :no_mouth:")
st.markdown("For more information about this dataset, check out [its website](https://huggingface.co/datasets/"+dataset_name+")")
st.markdown("## Now let's see what metrics we can use to evaluate models on this dataset!")
st.markdown("### Dataset-Specific Metrics")
if dataset_name in metrics:
    st.markdown("Great news! This dataset has a dedicated metric for it! :partying_face: You can use it like this: :point_down:")
    if "glue" in dataset_name:
        code = f'''from datasets import load_metric
metric = load_metric("{dataset_name}", "{dataset_config}")'''
    else:
        code = f'''from datasets import load_metric
metric = load_metric("{dataset_name}")'''
    st.code(code, language='python')
    dedicated_metric = True
else:
    st.markdown("This dataset doesn't have a dedicated metric, but that's ok! :wink:")
    dedicated_metric = False
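# Task-specific metrics: only relevant when no dedicated metric was found.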
if not dedicated_metric:
    st.markdown("### Task-Specific Metrics")
    task = find_task(dataset_name)
    if task is not None:
        task = task.replace(' ', '-')
        st.markdown("The task associated with this dataset is: " + task.replace('-', ' '))
        if task == 'automatic-speech-recognition':
            st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
            st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
            wer_code = '''from datasets import load_metric
metric = load_metric("wer")'''
            st.code(wer_code, language='python')
            st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
            cer_code = '''from datasets import load_metric
metric = load_metric("cer")'''
            st.code(cer_code, language='python')
        else:
            st.markdown("The task for this dataset doesn't have any dedicated metrics, but you can still use general ones! :cowboy_hat_face:")
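# Label-based suggestions: load the chosen split, plot the label distribution,
# and recommend accuracy for balanced datasets or F1 for imbalanced ones;
# if the dataset has no label column, fall back to perplexity (unless the
# task is speech recognition).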
if not dedicated_metric:
    if dataset_name in ['glue', 'super_glue', 'paws', 'squad_es']:
        dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
    else:
        dataset = load_dataset(dataset_name, split=dataset_split)
    try:
        num_classes = dataset_builder.info.features['label'].num_classes
        label_names = dataset_builder.info.features['label'].names
        labels = query("SELECT COUNT(*) FROM dataset GROUP BY label ORDER BY label").to_pandas()
        labels = labels.rename(columns={"count_star()": "count"})
        labels.index = label_names
        st.markdown("### Labelled Metrics")
        st.markdown("This dataset has " + str(num_classes) + " labels: " + ', '.join(label_names))
        # TODO: figure out how to make a label plot
        st.plotly_chart(px.pie(labels, values="count", names=labels.index, width=800, height=400))
        total = sum(labels['count'])
        proportion = [c / total for c in labels['count']]
        stdev_dataset = statistics.stdev(proportion)
        if stdev_dataset <= balanced_stdev:
            st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset, 2)) + "), you can look at using:")
            st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
            accuracy_code = '''from datasets import load_metric
metric = load_metric("accuracy")'''
            st.code(accuracy_code, language='python')
        else:
            st.markdown("Since this dataset is not well-balanced (with a standard deviation of " + str(round(stdev_dataset, 2)) + "), you can look at using:")
            st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
            f1_code = '''from datasets import load_metric
metric = load_metric("f1")'''
            st.code(f1_code, language='python')
            st.markdown('F1 takes into account both precision and recall, which makes it well suited to evaluating model performance on minority classes.')
    except Exception:
        if task != 'automatic-speech-recognition':
            st.markdown("### Unsupervised Metrics")
            st.markdown("Since this dataset doesn't have any labels, the metrics that you can use for evaluation are:")
            st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
            perplexity_code = '''from datasets import load_metric
metric = load_metric("perplexity")'''
            st.code(perplexity_code, language='python')
            st.markdown('If you use a model that was trained on **' + dataset_name + '** to compute the perplexity of text generated by your model, this can help determine how similar the two are.')
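# To try this locally (assuming this file is saved as app.py and Streamlit is
# installed): streamlit run app.py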