Yacine Jernite committed on
Commit
0069e8c
1 Parent(s): e2f899f

in deployment mode, check whether the cache_dir exists

Browse files
Files changed (2) hide show
  1. app.py +69 -57
  2. data_measurements/dataset_statistics.py +15 -3
app.py CHANGED
@@ -14,7 +14,7 @@
14
 
15
  import logging
16
  from os import mkdir
17
- from os.path import isdir
18
  from pathlib import Path
19
 
20
  import streamlit as st
@@ -143,63 +143,63 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
143
 
144
  """
145
 
146
- if not isdir(CACHE_DIR):
147
- logs.warning("Creating cache")
148
- # We need to preprocess everything.
149
- # This should eventually all go into a prepare_dataset CLI
150
- mkdir(CACHE_DIR)
151
  if use_cache:
152
  logs.warning("Using cache")
153
- try:
 
154
  dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
155
  # Don't recalculate; we're live
156
  dstats.set_deployment(True)
157
- except:
158
- logs.warning("We're screwed")
159
- try:
160
- # We need to have the text_dset loaded for further load_or_prepare
161
- dstats.load_or_prepare_dataset()
162
- except:
163
- logs.warning("Missing a cache for load or prepare dataset")
164
- try:
165
- # Header widget
166
- dstats.load_or_prepare_dset_peek()
167
- except:
168
- logs.warning("Missing a cache for dset peek")
169
- try:
170
- # General stats widget
171
- dstats.load_or_prepare_general_stats()
172
- except:
173
- logs.warning("Missing a cache for general stats")
174
- try:
175
- # Labels widget
176
- dstats.load_or_prepare_labels()
177
- except:
178
- logs.warning("Missing a cache for prepare labels")
179
- try:
180
- # Text lengths widget
181
- dstats.load_or_prepare_text_lengths()
182
- except:
183
- logs.warning("Missing a cache for text lengths")
184
- if show_embeddings:
185
  try:
186
- # Embeddings widget
187
- dstats.load_or_prepare_embeddings()
188
  except:
189
- logs.warning("Missing a cache for embeddings")
190
- try:
191
- dstats.load_or_prepare_text_duplicates()
192
- except:
193
- logs.warning("Missing a cache for text duplicates")
194
- try:
195
- dstats.load_or_prepare_npmi()
196
- except:
197
- logs.warning("Missing a cache for npmi")
198
- try:
199
- dstats.load_or_prepare_zipf()
200
- except:
201
- logs.warning("Missing a cache for zipf")
202
- return dstats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
205
  """
@@ -257,21 +257,33 @@ def main():
257
  dataset_args_left = st_utils.sidebar_selection(ds_name_to_dict, " A")
258
  dataset_args_right = st_utils.sidebar_selection(ds_name_to_dict, " B")
259
  left_col, _, right_col = st.columns([10, 1, 10])
260
- dstats_left = load_or_prepare_widgets(
261
  dataset_args_left, show_embeddings, use_cache=use_cache
262
  )
263
  with left_col:
264
- show_column(dstats_left, ds_name_to_dict, show_embeddings, " A")
265
- dstats_right = load_or_prepare_widgets(
 
 
 
 
266
  dataset_args_right, show_embeddings, use_cache=use_cache
267
  )
268
  with right_col:
269
- show_column(dstats_right, ds_name_to_dict, show_embeddings, " B")
 
 
 
 
270
  else:
271
  logs.warning("Using Single Dataset Mode")
272
  dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
273
- dstats = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
274
- show_column(dstats, ds_name_to_dict, show_embeddings, "")
 
 
 
 
275
 
276
 
277
  if __name__ == "__main__":
 
14
 
15
  import logging
16
  from os import mkdir
17
+ from os.path import exists, isdir
18
  from pathlib import Path
19
 
20
  import streamlit as st
 
143
 
144
  """
145
 
 
 
 
 
 
146
  if use_cache:
147
  logs.warning("Using cache")
148
+ if True:
149
+ #try:
150
  dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
151
  # Don't recalculate; we're live
152
  dstats.set_deployment(True)
153
+ # checks whether the cache_dir exists in deployment mode
154
+ # creates cache_dir if not and if in development mode
155
+ cache_dir_exists = dstats.check_cache_dir()
156
+ #except:
157
+ # logs.warning("We're screwed")
158
+ if cache_dir_exists:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  try:
160
+ # We need to have the text_dset loaded for further load_or_prepare
161
+ dstats.load_or_prepare_dataset()
162
  except:
163
+ logs.warning("Missing a cache for load or prepare dataset")
164
+ try:
165
+ # Header widget
166
+ dstats.load_or_prepare_dset_peek()
167
+ except:
168
+ logs.warning("Missing a cache for dset peek")
169
+ try:
170
+ # General stats widget
171
+ dstats.load_or_prepare_general_stats()
172
+ except:
173
+ logs.warning("Missing a cache for general stats")
174
+ try:
175
+ # Labels widget
176
+ dstats.load_or_prepare_labels()
177
+ except:
178
+ logs.warning("Missing a cache for prepare labels")
179
+ try:
180
+ # Text lengths widget
181
+ dstats.load_or_prepare_text_lengths()
182
+ except:
183
+ logs.warning("Missing a cache for text lengths")
184
+ if show_embeddings:
185
+ try:
186
+ # Embeddings widget
187
+ dstats.load_or_prepare_embeddings()
188
+ except:
189
+ logs.warning("Missing a cache for embeddings")
190
+ try:
191
+ dstats.load_or_prepare_text_duplicates()
192
+ except:
193
+ logs.warning("Missing a cache for text duplicates")
194
+ try:
195
+ dstats.load_or_prepare_npmi()
196
+ except:
197
+ logs.warning("Missing a cache for npmi")
198
+ try:
199
+ dstats.load_or_prepare_zipf()
200
+ except:
201
+ logs.warning("Missing a cache for zipf")
202
+ return dstats, cache_dir_exists
203
 
204
  def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
205
  """
 
257
  dataset_args_left = st_utils.sidebar_selection(ds_name_to_dict, " A")
258
  dataset_args_right = st_utils.sidebar_selection(ds_name_to_dict, " B")
259
  left_col, _, right_col = st.columns([10, 1, 10])
260
+ dstats_left, cache_exists_left = load_or_prepare_widgets(
261
  dataset_args_left, show_embeddings, use_cache=use_cache
262
  )
263
  with left_col:
264
+ if cache_exists_left:
265
+ show_column(dstats_left, ds_name_to_dict, show_embeddings, " A")
266
+ else:
267
+ st.markdown("### Missing pre-computed data measures!")
268
+ st.write(dataset_args_left)
269
+ dstats_right, cache_exists_right = load_or_prepare_widgets(
270
  dataset_args_right, show_embeddings, use_cache=use_cache
271
  )
272
  with right_col:
273
+ if cache_exists_right:
274
+ show_column(dstats_right, ds_name_to_dict, show_embeddings, " B")
275
+ else:
276
+ st.markdown("### Missing pre-computed data measures!")
277
+ st.write(dataset_args_right)
278
  else:
279
  logs.warning("Using Single Dataset Mode")
280
  dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
281
+ dstats, cache_exists = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
282
+ if cache_exists:
283
+ show_column(dstats, ds_name_to_dict, show_embeddings, "")
284
+ else:
285
+ st.markdown("### Missing pre-computed data measures!")
286
+ st.write(dataset_args)
287
 
288
 
289
  if __name__ == "__main__":
data_measurements/dataset_statistics.py CHANGED
@@ -245,9 +245,6 @@ class DatasetStatisticsCacheClass:
245
  self.cache_dir,
246
  f"{dset_name}_{dset_config}_{split_name}_{text_field}", # {label_field},
247
  )
248
- if not isdir(self.cache_path):
249
- logs.warning("Creating cache directory %s." % self.cache_path)
250
- mkdir(self.cache_path)
251
 
252
  # Cache files not needed for UI
253
  self.dset_fid = pjoin(self.cache_path, "base_dset")
@@ -302,6 +299,21 @@ class DatasetStatisticsCacheClass:
302
  """
303
  self.live = live
304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  def get_base_dataset(self):
306
  """Gets a pointer to the truncated base dataset object."""
307
  if not self.dset:
 
245
  self.cache_dir,
246
  f"{dset_name}_{dset_config}_{split_name}_{text_field}", # {label_field},
247
  )
 
 
 
248
 
249
  # Cache files not needed for UI
250
  self.dset_fid = pjoin(self.cache_path, "base_dset")
 
299
  """
300
  self.live = live
301
 
302
+ def check_cache_dir(self):
303
+ """
304
+ First function to call to create the cache directory.
305
+ If in deployment mode and cache directory does not already exist,
306
+ return False.
307
+ """
308
+ if self.live:
309
+ return isdir(self.cache_path)
310
+ else:
311
+ if not isdir(self.cache_path):
312
+ logs.warning("Creating cache directory %s." % self.cache_path)
313
+ mkdir(self.cache_path)
314
+ return isdir(self.cache_path)
315
+
316
+
317
  def get_base_dataset(self):
318
  """Gets a pointer to the truncated base dataset object."""
319
  if not self.dset: