Spaces:
Build error
Build error
Yacine Jernite
committed on
Commit
•
0069e8c
1
Parent(s):
e2f899f
in deployment mode, check whether the cache_dir exists
Browse files- app.py +69 -57
- data_measurements/dataset_statistics.py +15 -3
app.py
CHANGED
@@ -14,7 +14,7 @@
|
|
14 |
|
15 |
import logging
|
16 |
from os import mkdir
|
17 |
-
from os.path import isdir
|
18 |
from pathlib import Path
|
19 |
|
20 |
import streamlit as st
|
@@ -143,63 +143,63 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
|
143 |
|
144 |
"""
|
145 |
|
146 |
-
if not isdir(CACHE_DIR):
|
147 |
-
logs.warning("Creating cache")
|
148 |
-
# We need to preprocess everything.
|
149 |
-
# This should eventually all go into a prepare_dataset CLI
|
150 |
-
mkdir(CACHE_DIR)
|
151 |
if use_cache:
|
152 |
logs.warning("Using cache")
|
153 |
-
|
|
|
154 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
|
155 |
# Don't recalculate; we're live
|
156 |
dstats.set_deployment(True)
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
logs.warning("Missing a cache for load or prepare dataset")
|
164 |
-
try:
|
165 |
-
# Header widget
|
166 |
-
dstats.load_or_prepare_dset_peek()
|
167 |
-
except:
|
168 |
-
logs.warning("Missing a cache for dset peek")
|
169 |
-
try:
|
170 |
-
# General stats widget
|
171 |
-
dstats.load_or_prepare_general_stats()
|
172 |
-
except:
|
173 |
-
logs.warning("Missing a cache for general stats")
|
174 |
-
try:
|
175 |
-
# Labels widget
|
176 |
-
dstats.load_or_prepare_labels()
|
177 |
-
except:
|
178 |
-
logs.warning("Missing a cache for prepare labels")
|
179 |
-
try:
|
180 |
-
# Text lengths widget
|
181 |
-
dstats.load_or_prepare_text_lengths()
|
182 |
-
except:
|
183 |
-
logs.warning("Missing a cache for text lengths")
|
184 |
-
if show_embeddings:
|
185 |
try:
|
186 |
-
#
|
187 |
-
dstats.
|
188 |
except:
|
189 |
-
logs.warning("Missing a cache for
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
|
205 |
"""
|
@@ -257,21 +257,33 @@ def main():
|
|
257 |
dataset_args_left = st_utils.sidebar_selection(ds_name_to_dict, " A")
|
258 |
dataset_args_right = st_utils.sidebar_selection(ds_name_to_dict, " B")
|
259 |
left_col, _, right_col = st.columns([10, 1, 10])
|
260 |
-
dstats_left = load_or_prepare_widgets(
|
261 |
dataset_args_left, show_embeddings, use_cache=use_cache
|
262 |
)
|
263 |
with left_col:
|
264 |
-
|
265 |
-
|
|
|
|
|
|
|
|
|
266 |
dataset_args_right, show_embeddings, use_cache=use_cache
|
267 |
)
|
268 |
with right_col:
|
269 |
-
|
|
|
|
|
|
|
|
|
270 |
else:
|
271 |
logs.warning("Using Single Dataset Mode")
|
272 |
dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
|
273 |
-
dstats = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
|
274 |
-
|
|
|
|
|
|
|
|
|
275 |
|
276 |
|
277 |
if __name__ == "__main__":
|
|
|
14 |
|
15 |
import logging
|
16 |
from os import mkdir
|
17 |
+
from os.path import exists, isdir
|
18 |
from pathlib import Path
|
19 |
|
20 |
import streamlit as st
|
|
|
143 |
|
144 |
"""
|
145 |
|
|
|
|
|
|
|
|
|
|
|
146 |
if use_cache:
|
147 |
logs.warning("Using cache")
|
148 |
+
if True:
|
149 |
+
#try:
|
150 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
|
151 |
# Don't recalculate; we're live
|
152 |
dstats.set_deployment(True)
|
153 |
+
# checks whether the cache_dir exists in deployment mode
|
154 |
+
# creates cache_dir if not and if in development mode
|
155 |
+
cache_dir_exists = dstats.check_cache_dir()
|
156 |
+
#except:
|
157 |
+
# logs.warning("We're screwed")
|
158 |
+
if cache_dir_exists:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
try:
|
160 |
+
# We need to have the text_dset loaded for further load_or_prepare
|
161 |
+
dstats.load_or_prepare_dataset()
|
162 |
except:
|
163 |
+
logs.warning("Missing a cache for load or prepare dataset")
|
164 |
+
try:
|
165 |
+
# Header widget
|
166 |
+
dstats.load_or_prepare_dset_peek()
|
167 |
+
except:
|
168 |
+
logs.warning("Missing a cache for dset peek")
|
169 |
+
try:
|
170 |
+
# General stats widget
|
171 |
+
dstats.load_or_prepare_general_stats()
|
172 |
+
except:
|
173 |
+
logs.warning("Missing a cache for general stats")
|
174 |
+
try:
|
175 |
+
# Labels widget
|
176 |
+
dstats.load_or_prepare_labels()
|
177 |
+
except:
|
178 |
+
logs.warning("Missing a cache for prepare labels")
|
179 |
+
try:
|
180 |
+
# Text lengths widget
|
181 |
+
dstats.load_or_prepare_text_lengths()
|
182 |
+
except:
|
183 |
+
logs.warning("Missing a cache for text lengths")
|
184 |
+
if show_embeddings:
|
185 |
+
try:
|
186 |
+
# Embeddings widget
|
187 |
+
dstats.load_or_prepare_embeddings()
|
188 |
+
except:
|
189 |
+
logs.warning("Missing a cache for embeddings")
|
190 |
+
try:
|
191 |
+
dstats.load_or_prepare_text_duplicates()
|
192 |
+
except:
|
193 |
+
logs.warning("Missing a cache for text duplicates")
|
194 |
+
try:
|
195 |
+
dstats.load_or_prepare_npmi()
|
196 |
+
except:
|
197 |
+
logs.warning("Missing a cache for npmi")
|
198 |
+
try:
|
199 |
+
dstats.load_or_prepare_zipf()
|
200 |
+
except:
|
201 |
+
logs.warning("Missing a cache for zipf")
|
202 |
+
return dstats, cache_dir_exists
|
203 |
|
204 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
|
205 |
"""
|
|
|
257 |
dataset_args_left = st_utils.sidebar_selection(ds_name_to_dict, " A")
|
258 |
dataset_args_right = st_utils.sidebar_selection(ds_name_to_dict, " B")
|
259 |
left_col, _, right_col = st.columns([10, 1, 10])
|
260 |
+
dstats_left, cache_exists_left = load_or_prepare_widgets(
|
261 |
dataset_args_left, show_embeddings, use_cache=use_cache
|
262 |
)
|
263 |
with left_col:
|
264 |
+
if cache_exists_left:
|
265 |
+
show_column(dstats_left, ds_name_to_dict, show_embeddings, " A")
|
266 |
+
else:
|
267 |
+
st.markdown("### Missing pre-computed data measures!")
|
268 |
+
st.write(dataset_args_left)
|
269 |
+
dstats_right, cache_exists_right = load_or_prepare_widgets(
|
270 |
dataset_args_right, show_embeddings, use_cache=use_cache
|
271 |
)
|
272 |
with right_col:
|
273 |
+
if cache_exists_right:
|
274 |
+
show_column(dstats_right, ds_name_to_dict, show_embeddings, " B")
|
275 |
+
else:
|
276 |
+
st.markdown("### Missing pre-computed data measures!")
|
277 |
+
st.write(dataset_args_right)
|
278 |
else:
|
279 |
logs.warning("Using Single Dataset Mode")
|
280 |
dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
|
281 |
+
dstats, cache_exists = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
|
282 |
+
if cache_exists:
|
283 |
+
show_column(dstats, ds_name_to_dict, show_embeddings, "")
|
284 |
+
else:
|
285 |
+
st.markdown("### Missing pre-computed data measures!")
|
286 |
+
st.write(dataset_args)
|
287 |
|
288 |
|
289 |
if __name__ == "__main__":
|
data_measurements/dataset_statistics.py
CHANGED
@@ -245,9 +245,6 @@ class DatasetStatisticsCacheClass:
|
|
245 |
self.cache_dir,
|
246 |
f"{dset_name}_{dset_config}_{split_name}_{text_field}", # {label_field},
|
247 |
)
|
248 |
-
if not isdir(self.cache_path):
|
249 |
-
logs.warning("Creating cache directory %s." % self.cache_path)
|
250 |
-
mkdir(self.cache_path)
|
251 |
|
252 |
# Cache files not needed for UI
|
253 |
self.dset_fid = pjoin(self.cache_path, "base_dset")
|
@@ -302,6 +299,21 @@ class DatasetStatisticsCacheClass:
|
|
302 |
"""
|
303 |
self.live = live
|
304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
def get_base_dataset(self):
|
306 |
"""Gets a pointer to the truncated base dataset object."""
|
307 |
if not self.dset:
|
|
|
245 |
self.cache_dir,
|
246 |
f"{dset_name}_{dset_config}_{split_name}_{text_field}", # {label_field},
|
247 |
)
|
|
|
|
|
|
|
248 |
|
249 |
# Cache files not needed for UI
|
250 |
self.dset_fid = pjoin(self.cache_path, "base_dset")
|
|
|
299 |
"""
|
300 |
self.live = live
|
301 |
|
302 |
+
def check_cache_dir(self):
    """
    Check for this dataset's cache directory, creating it when allowed.

    This is meant to be the first call made on the object when setting up
    the cache.

    In deployment mode (``self.live`` is truthy) nothing is created: the
    method only reports whether ``self.cache_path`` already exists.
    Otherwise the directory is created if it is missing.

    Returns:
        bool: True if ``self.cache_path`` is a directory when the method
        returns, False otherwise.
    """
    if not self.live and not isdir(self.cache_path):
        # Local import: keeps this method independent of the module's
        # top-level import list, which only needs to provide ``isdir``.
        from os import makedirs

        logs.warning("Creating cache directory %s." % self.cache_path)
        # makedirs (unlike mkdir) also creates a missing parent cache
        # directory, and exist_ok=True tolerates another process creating
        # the directory between the isdir check and this call.
        makedirs(self.cache_path, exist_ok=True)
    # Single exit point: report the actual state of the filesystem.
    return isdir(self.cache_path)
|
315 |
+
|
316 |
+
|
317 |
def get_base_dataset(self):
|
318 |
"""Gets a pointer to the truncated base dataset object."""
|
319 |
if not self.dset:
|