meg-huggingface committed
Commit • d3c28ec • 1 Parent(s): e3f7160
Loading per-widget. Various changes to streamlit interactions for efficiency.
Files changed:
- app.py +13 -14
- data_measurements/dataset_statistics.py +33 -23
- data_measurements/streamlit_utils.py +11 -6
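At a high level, this commit moves the app from one monolithic "prepare everything" pass to a per-widget load-or-prepare pattern gated by a deployment flag. A minimal, self-contained sketch of that pattern, assuming illustrative names (this is not the repo's actual API):

import json
from os.path import exists

class WidgetStatsSketch:
    """Sketch of the per-widget load-or-prepare pattern this commit
    introduces. All names here are illustrative, not the repo's API."""

    def __init__(self, cache_fid, live=False):
        self.cache_fid = cache_fid  # one cache file per widget
        self.live = live            # True when deployed: never recompute
        self.stats = None

    def load_or_prepare(self, save=True):
        # A cached result always wins, so each widget loads independently.
        if exists(self.cache_fid):
            with open(self.cache_fid, encoding="utf-8") as f:
                self.stats = json.load(f)
        # Only compute (and optionally persist) when not live.
        elif not self.live:
            self.stats = self._prepare()
            if save:
                with open(self.cache_fid, "w", encoding="utf-8") as f:
                    json.dump(self.stats, f)

    def _prepare(self):
        # Stand-in for the expensive per-widget computation.
        return {"total_words": 0}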
app.py CHANGED

@@ -101,20 +101,18 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
     if use_cache:
         logs.warning("Using cache")
         dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
-        logs.warning("Loading
+        logs.warning("Loading dataset")
         dstats.load_or_prepare_dataset()
-        logs.warning("
+        logs.warning("Loading labels")
         dstats.load_or_prepare_labels()
-        logs.warning("
+        logs.warning("Loading text lengths")
         dstats.load_or_prepare_text_lengths()
-        logs.warning("
+        logs.warning("Loading duplicates")
         dstats.load_or_prepare_text_duplicates()
-        logs.warning("
+        logs.warning("Loading vocabulary")
         dstats.load_or_prepare_vocab()
-        logs.warning("
+        logs.warning("Loading general statistics...")
         dstats.load_or_prepare_general_stats()
-        logs.warning("Completed Calculation.")
-        logs.warning("Calculating Fine-Grained Statistics...")
         if show_embeddings:
             logs.warning("Loading Embeddings")
             dstats.load_or_prepare_embeddings()

@@ -135,6 +133,7 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
     Returns:
 
     """
+
     if not isdir(CACHE_DIR):
         logs.warning("Creating cache")
         # We need to preprocess everything.

@@ -143,6 +142,8 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
     if use_cache:
         logs.warning("Using cache")
         dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+        # Don't recalculate; we're live
+        dstats.set_deployment(True)
         # Header widget
         dstats.load_or_prepare_dset_peek()
         # General stats widget

@@ -157,23 +158,21 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
         dstats.load_or_prepare_text_duplicates()
         dstats.load_or_prepare_npmi()
         dstats.load_or_prepare_zipf()
-
-        dstats.set_deployment(True)
+    return dstats
 
-def show_column(dstats, ds_name_to_dict, show_embeddings, column_id
+def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
     """
     Function for displaying the elements in the right column of the streamlit app.
     Args:
         ds_name_to_dict (dict): the dataset name and options in dictionary form
         show_embeddings (Bool): whether embeddings should be loaded and displayed for this dataset
         column_id (str): what column of the dataset the analysis is done on
-        use_cache (Bool): whether the cache is used by default or not
     Returns:
         The function displays the information using the functions defined in the st_utils class.
     """
     # Note that at this point we assume we can use cache; default value is True.
     # start showing stuff
-    title_str = f"### Showing{column_id}: {dstats.dset_name} - {dstats.dset_config} - {'-'.join(dstats.text_field)}"
+    title_str = f"### Showing{column_id}: {dstats.dset_name} - {dstats.dset_config} - {dstats.split_name} - {'-'.join(dstats.text_field)}"
     st.markdown(title_str)
     logs.info("showing header")
     st_utils.expander_header(dstats, ds_name_to_dict, column_id)

@@ -230,7 +229,7 @@ def main():
     else:
         logs.warning("Using Single Dataset Mode")
         dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
-        dstats =
+        dstats = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
         show_column(dstats, ds_name_to_dict, show_embeddings, "")
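Returning dstats from load_or_prepare_widgets is what lets main() hand a single prepared object to show_column. It also opens the door to memoizing the whole preparation step across Streamlit reruns. A hypothetical wrapper, not part of this commit, using Streamlit's st.cache decorator (the caching API of that era):

import streamlit as st

@st.cache(allow_output_mutation=True)
def load_widgets_cached(dset_name, show_embeddings):
    # Hypothetical: the real app would call load_or_prepare_widgets(...)
    # here and return its dstats; a stub stands in to keep this runnable.
    return {"dset_name": dset_name, "show_embeddings": show_embeddings}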
data_measurements/dataset_statistics.py CHANGED

@@ -178,6 +178,7 @@ class DatasetStatisticsCacheClass:
         self.dset_config = dset_config
         # name of the split to analyze
         self.split_name = split_name
+        # TODO: Should this be "feature"?
         # which text fields are we analysing?
         self.text_field = text_field
         # which label fields are we analysing?

@@ -207,6 +208,7 @@ class DatasetStatisticsCacheClass:
         self.vocab_counts_df = None
         # Vocabulary filtered to remove stopwords
         self.vocab_counts_filtered_df = None
+        self.sorted_top_vocab_df = None
         ## General statistics and duplicates
         self.total_words = 0
         self.total_open_words = 0

@@ -340,12 +342,13 @@ class DatasetStatisticsCacheClass:
             logs.info('Loading cached general stats')
             self.load_general_stats()
         else:
-
-
-
-
-
-
+            if not self.live:
+                logs.info('Preparing general stats')
+                self.prepare_general_stats()
+                if save:
+                    write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
+                    write_json(self.general_stats_dict, self.general_stats_json_fid)
 
 
     def load_or_prepare_text_lengths(self, save=True):

@@ -362,17 +365,19 @@ class DatasetStatisticsCacheClass:
         if (self.use_cache and exists(self.fig_tok_length_fid)):
             self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
         else:
-            self.
-
-
+            if not self.live:
+                self.prepare_fig_text_lengths()
+                if save:
+                    write_plotly(self.fig_tok_length, self.fig_tok_length_fid)
 
         # Text length dataframe
         if self.use_cache and exists(self.length_df_fid):
             self.length_df = feather.read_feather(self.length_df_fid)
         else:
-            self.
-
-
+            if not self.live:
+                self.prepare_length_df()
+                if save:
+                    write_df(self.length_df, self.length_df_fid)
 
         # Text length stats.
         if self.use_cache and exists(self.length_stats_json_fid):

@@ -382,9 +387,10 @@ class DatasetStatisticsCacheClass:
             self.std_length = self.length_stats_dict["std length"]
             self.num_uniq_lengths = self.length_stats_dict["num lengths"]
         else:
-            self.
-
-
+            if not self.live:
+                self.prepare_text_length_stats()
+                if save:
+                    write_json(self.length_stats_dict, self.length_stats_json_fid)
 
     def prepare_length_df(self):
         if not self.live:

@@ -481,15 +487,17 @@ class DatasetStatisticsCacheClass:
             with open(self.dup_counts_df_fid, "rb") as f:
                 self.dup_counts_df = feather.read_feather(f)
         elif self.dup_counts_df is None:
-            self.
-
-
+            if not self.live:
+                self.prepare_text_duplicates()
+                if save:
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
         else:
-
-
-
-
-
+            if not self.live:
+                # self.dup_counts_df is already defined, e.g. when general
+                # statistics were calculated first, since general statistics
+                # requires the number of duplicates; just save it here.
+                if save:
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
 
     def load_general_stats(self):
         self.general_stats_dict = json.load(open(self.general_stats_json_fid, encoding="utf-8"))

@@ -815,6 +823,8 @@ class nPMIStatisticsCacheClass:
             write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
             with open(joint_npmi_fid, "w+") as f:
                 joint_npmi_df.to_csv(f)
+        else:
+            joint_npmi_df = pd.DataFrame()
         logs.info("The joint npmi df is")
         logs.info(joint_npmi_df)
         return joint_npmi_df
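The recurring shape in these hunks is a three-way guard: read from cache if the file exists, compute and save only when not live, and if the value was already computed by an earlier step (as general stats does for duplicates), just persist it. A condensed sketch of that idiom, with hypothetical names and JSON standing in for the feather files:

import json
from os.path import exists

def load_or_prepare(obj, attr, fid, prepare, use_cache=True, save=True):
    """Hypothetical helper condensing the guard used throughout this file."""
    if use_cache and exists(fid):
        with open(fid, encoding="utf-8") as f:
            setattr(obj, attr, json.load(f))  # cache hit: just read
    elif getattr(obj, attr) is None:
        if not obj.live:
            setattr(obj, attr, prepare())  # compute only when not live
            if save:
                with open(fid, "w", encoding="utf-8") as f:
                    json.dump(getattr(obj, attr), f)
    else:
        # Already computed by an earlier step; only persist it.
        if not obj.live and save:
            with open(fid, "w", encoding="utf-8") as f:
                json.dump(getattr(obj, attr), f)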
data_measurements/streamlit_utils.py CHANGED

@@ -126,13 +126,18 @@ def expander_general_stats(dstats, column_id):
                 str(dstats.text_nan_count)
             )
         )
-
-
-
-
-
+        if dstats.dedup_total > 0:
+            st.markdown(
+                "There are {0} duplicate items in the dataset. "
+                "For more information about the duplicates, "
+                "click the 'Duplicates' tab below.".format(
+                    str(dstats.dedup_total)
+                )
         )
-
+        else:
+            st.markdown(
+                "There are 0 duplicate items in the dataset. ")
+
 
 
     ### Show the label distribution from the datasets
|