Sasha committed on
Commit 1f09890
1 Parent(s): d720be7

Initial version of the Evaluation Buddy -- currently most things are hardcoded (e.g. the dataset list), but the goal is to make it all compatible with the Hub!

Files changed (2)
  1. app.py +128 -0
  2. robot.png +0 -0
app.py ADDED
@@ -0,0 +1,128 @@
+ import streamlit as st
+ from datasets import load_dataset_builder
+ from datasets import get_dataset_config_names
+ from os import listdir
+ from datasets import load_dataset, Dataset
+ from datasets_sql import query
+ import plotly.express as px
+ import numpy as np
+ import statistics
+
+ st.set_page_config(
+     page_title="Evaluation Buddy",
+     page_icon="./robot.png",
+     layout="wide",
+ )
+
+ st.title("Hugging Face Evaluation Buddy")
+
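+ # Hardcoded list of popular Hub datasets for now; the plan (per the commit message) is to pull these from the Hub instead.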
+ top_datasets = ['glue', 'super_glue', 'wikitext', 'imdb', 'squad', 'squad_es', \
+                 'paws', 'librispeech_asr', 'wmt16', 'xnli', 'snli', 'ag_news', \
+                 'anli', 'amazon_polarity', 'squad_v2', 'conll2003', 'red_caps', \
+                 'common_voice', 'stsb_multi_mt', 'trec', 'tweet_eval', 'cosmos_qa', \
+                 'sick', 'xsum', 'wikiann', 'yelp_polarity', 'hellaswag', 'piqa', \
+                 'race', 'winogrande']
+
+ tasks = ['text-classification', 'question-answering-extractive', 'automatic-speech-recognition']
+
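+ # Sidebar controls: choose the dataset, configuration, and split, plus the standard-deviation
+ # threshold used below to decide whether the label distribution counts as balanced.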
+ with st.sidebar.expander("Datasets", expanded=True):
+     dataset_name = st.selectbox(
+         "Choose a dataset to evaluate on:",
+         sorted(top_datasets))
+     configs = get_dataset_config_names(dataset_name)
+     dataset_config = st.selectbox(
+         "Choose a configuration of your dataset:",
+         configs)
+     dataset_builder = load_dataset_builder(dataset_name, dataset_config)
+     splits = [s for s in dataset_builder.info.splits]
+     dataset_split = st.selectbox(
+         "Choose a dataset split:",
+         splits)
+     balanced_stdev = st.slider("Choose a standard deviation threshold for determining whether a dataset is balanced or not:", 0.00, 1.00, 0.20)
+
+ st.markdown("## Here is some information about your dataset:")
+
+ st.markdown("### Description")
+
+ st.markdown(dataset_builder.info.description)
+ st.markdown("For more information about this dataset, check out [its website](https://huggingface.co/datasets/" + dataset_name + ")")
+
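+ # Dataset-specific metric check: look for a metric with the same name as the dataset in a local ../datasets/metrics/ checkout.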
+ st.markdown("### Dataset-Specific Metrics")
+ if dataset_name in listdir('../datasets/metrics/'):
+     st.markdown("Great news! Your dataset has a dedicated metric for it! You can use it like this:")
+     code = '''from datasets import load_metric
+ metric = load_metric("''' + dataset_name + '''", "''' + dataset_config + '''")'''
+     st.code(code, language='python')
+     dedicated_metric = True
+ else:
+     st.markdown("Your dataset doesn't have a dedicated metric, but that's ok!")
+     dedicated_metric = False
+
+ st.markdown("### Task-Specific Metrics")
+
+ try:
+     task = dataset_builder.info.task_templates[0].task
+     st.markdown("The task associated with it is: " + task)
+     if task == 'automatic-speech-recognition':
+         st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
+         st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
+         st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
+     else:
+         st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")
+ except:
+     st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")
+
+ #print(dataset_builder.info.task_templates)
+ #print(dataset_builder.info.features)
+
+ #st.markdown("### General Metrics")
+
+ #dataset = load_dataset(dataset_name, dataset_config, dataset_split)
+ #print(dataset_name, dataset_config, dataset_split)
+
+ #print(labels.head())
+
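+ # Label analysis: count examples per label with a datasets_sql query, plot the distribution,
+ # and suggest accuracy (balanced) or F1 (imbalanced) based on the stdev threshold from the sidebar;
+ # datasets without a 'label' feature fall through to the unsupervised (perplexity) branch.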
+ try:
+     num_classes = dataset_builder.info.features['label'].num_classes
+     dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
+     labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas()
+     labels = labels.rename(columns={"count_star()": "count"})
+     labels.index = dataset_builder.info.features['label'].names
+     st.markdown("### Labelled Metrics")
+     st.markdown("Your dataset has " + str(num_classes) + " labels: " + ', '.join(dataset_builder.info.features['label'].names))
+     #TODO: figure out how to make a label plot
+     st.plotly_chart(px.pie(labels, values="count", names=labels.index, width=800, height=400))
+     total = sum(c for c in labels['count'])
+     proportion = [c/total for c in labels['count']]
+     #proportion = [0.85, 0.15]
+     stdev_dataset = statistics.stdev(proportion)
+     if stdev_dataset <= balanced_stdev:
+         st.markdown("Since your dataset is well-balanced, you can look at using:")
+         st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
+         accuracy_code = '''from datasets import load_metric
+ metric = load_metric("accuracy")'''
+         st.code(accuracy_code, language='python')
+     else:
+         st.markdown("Since your dataset is not well-balanced, you can look at using:")
+         st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
+         f1_code = '''from datasets import load_metric
+ metric = load_metric("f1")'''
+         st.code(f1_code, language='python')
+         st.markdown('F1 takes into account both precision and recall, which works well for evaluating model performance on minority classes.')
+ except:
+     st.markdown("### Unsupervised Metrics")
+     st.markdown("Since your dataset doesn't have any labels, the metrics that you can use for evaluation are:")
+     st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
+     perplexity_code = '''from datasets import load_metric
+ metric = load_metric("perplexity")'''
+     st.code(perplexity_code, language='python')
+     st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
robot.png ADDED