meg (HF staff) committed
Commit: cbbc827
Parents: 8c830d7 86d20f7

Merge branch 'main' of https://huggingface.co/spaces/huggingface/data-measurements-tool-2 into main

Files changed (34)
  1. app.py +52 -19
  2. cache_dir/c4_en.noblocklist_train_text/fig_tok_length.png +3 -0
  3. cache_dir/c4_en_train_text/fig_tok_length.png +3 -0
  4. cache_dir/c4_realnewslike_train_text/fig_tok_length.png +3 -0
  5. cache_dir/c4_realnewslike_train_text/text_dset/dataset.arrow +3 -0
  6. cache_dir/c4_realnewslike_train_text/text_dset/dataset_info.json +3 -0
  7. cache_dir/c4_realnewslike_train_text/text_dset/state.json +3 -0
  8. cache_dir/squad_plain_text_train_context/fig_tok_length.png +2 -2
  9. cache_dir/squad_plain_text_train_question/fig_tok_length.png +2 -2
  10. cache_dir/squad_plain_text_train_title/fig_tok_length.png +2 -2
  11. cache_dir/squad_plain_text_validation_context/fig_tok_length.png +3 -0
  12. cache_dir/squad_plain_text_validation_question/fig_tok_length.png +3 -0
  13. cache_dir/squad_plain_text_validation_title/fig_tok_length.png +3 -0
  14. cache_dir/squad_v2_squad_v2_train_context/fig_tok_length.png +3 -0
  15. cache_dir/squad_v2_squad_v2_train_question/fig_tok_length.png +3 -0
  16. cache_dir/squad_v2_squad_v2_train_title/fig_tok_length.png +3 -0
  17. cache_dir/squad_v2_squad_v2_validation_context/fig_tok_length.png +3 -0
  18. cache_dir/squad_v2_squad_v2_validation_question/fig_tok_length.png +3 -0
  19. cache_dir/squad_v2_squad_v2_validation_title/fig_tok_length.png +3 -0
  20. cache_dir/super_glue_boolq_test_passage/fig_tok_length.png +3 -0
  21. cache_dir/super_glue_boolq_test_question/fig_tok_length.png +3 -0
  22. cache_dir/super_glue_cb_test_hypothesis/fig_tok_length.png +3 -0
  23. cache_dir/super_glue_cb_test_premise/fig_tok_length.png +3 -0
  24. cache_dir/super_glue_copa_test_choice1/fig_tok_length.png +3 -0
  25. cache_dir/super_glue_copa_test_choice2/fig_tok_length.png +3 -0
  26. cache_dir/super_glue_copa_test_premise/fig_tok_length.png +3 -0
  27. cache_dir/super_glue_copa_test_question/fig_tok_length.png +3 -0
  28. cache_dir/wikitext_wikitext-103-raw-v1_test_text/fig_tok_length.png +3 -0
  29. cache_dir/wikitext_wikitext-103-v1_test_text/fig_tok_length.png +3 -0
  30. cache_dir/wikitext_wikitext-2-raw-v1_test_text/fig_tok_length.png +3 -0
  31. cache_dir/wikitext_wikitext-2-v1_test_text/fig_tok_length.png +3 -0
  32. data_measurements/dataset_statistics.py +9 -8
  33. data_measurements/streamlit_utils.py +79 -67
  34. requirements.txt +2 -2
app.py CHANGED
@@ -117,7 +117,10 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
     logs.warning("Loading Embeddings")
     dstats.load_or_prepare_embeddings()
     logs.warning("Loading nPMI")
-    dstats.load_or_prepare_npmi()
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
     logs.warning("Loading Zipf")
     dstats.load_or_prepare_zipf()
     return dstats
@@ -147,25 +150,55 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
     mkdir(CACHE_DIR)
     if use_cache:
         logs.warning("Using cache")
-    dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
-    # Don't recalculate; we're live
-    dstats.set_deployment(True)
-    # We need to have the text_dset loaded for further load_or_prepare
-    dstats.load_or_prepare_dataset()
-    # Header widget
-    dstats.load_or_prepare_dset_peek()
-    # General stats widget
-    dstats.load_or_prepare_general_stats()
-    # Labels widget
-    dstats.load_or_prepare_labels()
-    # Text lengths widget
-    dstats.load_or_prepare_text_lengths()
+    try:
+        dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+        # Don't recalculate; we're live
+        dstats.set_deployment(True)
+    except:
+        logs.warning("We're screwed")
+    try:
+        # We need to have the text_dset loaded for further load_or_prepare
+        dstats.load_or_prepare_dataset()
+    except:
+        logs.warning("Missing a cache for load or prepare dataset")
+    try:
+        # Header widget
+        dstats.load_or_prepare_dset_peek()
+    except:
+        logs.warning("Missing a cache for dset peek")
+    try:
+        # General stats widget
+        dstats.load_or_prepare_general_stats()
+    except:
+        logs.warning("Missing a cache for general stats")
+    try:
+        # Labels widget
+        dstats.load_or_prepare_labels()
+    except:
+        logs.warning("Missing a cache for prepare labels")
+    try:
+        # Text lengths widget
+        dstats.load_or_prepare_text_lengths()
+    except:
+        logs.warning("Missing a cache for text lengths")
     if show_embeddings:
-        # Embeddings widget
-        dstats.load_or_prepare_embeddings()
-    dstats.load_or_prepare_text_duplicates()
-    dstats.load_or_prepare_npmi()
-    dstats.load_or_prepare_zipf()
+        try:
+            # Embeddings widget
+            dstats.load_or_prepare_embeddings()
+        except:
+            logs.warning("Missing a cache for embeddings")
+    try:
+        dstats.load_or_prepare_text_duplicates()
+    except:
+        logs.warning("Missing a cache for text duplicates")
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
+    try:
+        dstats.load_or_prepare_zipf()
+    except:
+        logs.warning("Missing a cache for zipf")
     return dstats
 
 def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
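The widget-loading path above repeats the same try/except shape for every cache. As a rough sketch only (the helper below is hypothetical and not part of this commit; the load_or_prepare_* methods are the ones in the diff, and logs stands in for app.py's logger), the steps could be driven from a list so each missing cache is logged uniformly:

import logging

logs = logging.getLogger(__name__)  # stand-in for app.py's module logger

def _load_widget_caches(dstats, show_embeddings):
    # Hypothetical refactor sketch; mirrors the try/except blocks above.
    steps = [
        ("load or prepare dataset", dstats.load_or_prepare_dataset),
        ("dset peek", dstats.load_or_prepare_dset_peek),
        ("general stats", dstats.load_or_prepare_general_stats),
        ("prepare labels", dstats.load_or_prepare_labels),
        ("text lengths", dstats.load_or_prepare_text_lengths),
    ]
    if show_embeddings:
        steps.append(("embeddings", dstats.load_or_prepare_embeddings))
    steps += [
        ("text duplicates", dstats.load_or_prepare_text_duplicates),
        ("npmi", dstats.load_or_prepare_npmi),
        ("zipf", dstats.load_or_prepare_zipf),
    ]
    for name, step in steps:
        try:
            step()
        except Exception:  # the cache may be missing for any step
            logs.warning("Missing a cache for %s" % name)
    return dstats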
cache_dir/c4_en.noblocklist_train_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 7cc045494f55ee52d94ebcae05ec2d936d4136a09fa7a01a4f854172352be843
  • Pointer size: 130 Bytes
  • Size of remote file: 38.3 kB
cache_dir/c4_en_train_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 9f53f89f4b7934746143833e287fbf2b616743bacba8d921db9d6a6bf8a6b62d
  • Pointer size: 130 Bytes
  • Size of remote file: 40.1 kB
cache_dir/c4_realnewslike_train_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 41d2915522c9b64ba7d8f975f52fda601030d86b9aa0767f0ea8a9439770468a
  • Pointer size: 130 Bytes
  • Size of remote file: 43.1 kB
cache_dir/c4_realnewslike_train_text/text_dset/dataset.arrow ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9813f70c9be641905ca737aa8f16e29d6aa17155a76cd830e7a627aed91431f4
+size 529606944

cache_dir/c4_realnewslike_train_text/text_dset/dataset_info.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff9f59542efc98b40f23b64408e3fbaed544ad8f0d1fb1e7126ead5af52844ac
+size 945

cache_dir/c4_realnewslike_train_text/text_dset/state.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2f6884f5ee381e5df2d267dae699aaf4792ba06c8f16830c9c19c144b4b3003
+size 256
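The three files above are stored with Git LFS, so what the repo actually tracks is the pointer shown in each diff: a spec version line, a sha256 oid, and the byte size of the real file. A minimal sketch of reading one of these pointers (parse_lfs_pointer is a hypothetical helper, not part of this commit):

def parse_lfs_pointer(text):
    # Each pointer line is "key value"; split on the first space only.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].split(":", 1)[1],
        "size_bytes": int(fields["size"]),
    }

# The dataset.arrow pointer above resolves to a ~530 MB remote file.
ptr = parse_lfs_pointer(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:9813f70c9be641905ca737aa8f16e29d6aa17155a76cd830e7a627aed91431f4\n"
    "size 529606944\n"
)
assert ptr["size_bytes"] == 529606944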
cache_dir/squad_plain_text_train_context/fig_tok_length.png CHANGED

Git LFS Details (before)

  • SHA256: d4ce94e5f9b40891bba84cae64842bfb3614d165c4c4c90931c20c558656a794
  • Pointer size: 130 Bytes
  • Size of remote file: 57.3 kB

Git LFS Details (after)

  • SHA256: 79e07ea05db8bd40a31eb8d114e3a3f250d9282740c9aadfe22bdfd440cfca58
  • Pointer size: 130 Bytes
  • Size of remote file: 57.7 kB
cache_dir/squad_plain_text_train_question/fig_tok_length.png CHANGED

Git LFS Details (before)

  • SHA256: 3a39496ae7d51b6472a3809431859d6822860dbe64bad60609b79f04ffc13f46
  • Pointer size: 130 Bytes
  • Size of remote file: 54 kB

Git LFS Details (after)

  • SHA256: 760db98db07e10e3019114fc0b48582e2afecfc31378b6672e0d13450a19774f
  • Pointer size: 130 Bytes
  • Size of remote file: 53.9 kB
cache_dir/squad_plain_text_train_title/fig_tok_length.png CHANGED

Git LFS Details (before)

  • SHA256: bf9fd7fd51590ec14b88cf6c9f3300f99c8e1fc4c98d45ee65c8d056db235e19
  • Pointer size: 130 Bytes
  • Size of remote file: 38.9 kB

Git LFS Details (after)

  • SHA256: 5e98b54a1a91f4b83fd07600c3b1981c10c73312e8e1e1426bdb10ce79c5c5cf
  • Pointer size: 130 Bytes
  • Size of remote file: 38.8 kB
cache_dir/squad_plain_text_validation_context/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 889be8dd2dcc65d0d6df490b874bd1c16c33807fb042815b831a40a64ee98bfa
  • Pointer size: 130 Bytes
  • Size of remote file: 59.4 kB
cache_dir/squad_plain_text_validation_question/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: d788d31d408a01443c3ae962c47e7da6a586e1d8257d39be7c044369190a1209
  • Pointer size: 130 Bytes
  • Size of remote file: 45.9 kB
cache_dir/squad_plain_text_validation_title/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: f011b73e9ee97d39bcc43ff7bd3f6839e34c10694c54a56ee8f0e475eca308e2
  • Pointer size: 130 Bytes
  • Size of remote file: 30 kB
cache_dir/squad_v2_squad_v2_train_context/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: d4fdf0874dc9ea371fb28570b84cb766f6cc8e4244b4e43695a7f6d098ac556f
  • Pointer size: 130 Bytes
  • Size of remote file: 54.9 kB
cache_dir/squad_v2_squad_v2_train_question/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 45f0292106c6f28c094cd8647a4a4aa0965d025570b282e5def599345f0c2367
  • Pointer size: 130 Bytes
  • Size of remote file: 59.6 kB
cache_dir/squad_v2_squad_v2_train_title/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: d9f8c12053922ae904fe380032b6fbe93956bd208239e6795283b2cc8f7ff8cb
  • Pointer size: 130 Bytes
  • Size of remote file: 35 kB
cache_dir/squad_v2_squad_v2_validation_context/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 5b4d13d9f9bf0ce0e1066510352ee73f73ae3a919577a4c7542e2025461c4e5c
  • Pointer size: 130 Bytes
  • Size of remote file: 54.6 kB
cache_dir/squad_v2_squad_v2_validation_question/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 121712ba1a7020e57b9f8c1771af5104db2d6176d00641fbd557886fd35249ef
  • Pointer size: 130 Bytes
  • Size of remote file: 47.5 kB
cache_dir/squad_v2_squad_v2_validation_title/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 5669c412ee49eeb6d1d8c2db7908187708663f316f4e71d3aaf51b594527ac25
  • Pointer size: 130 Bytes
  • Size of remote file: 32.1 kB
cache_dir/super_glue_boolq_test_passage/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 4879f362469fa91a268ae956bf7b6eb3b7eaed3999fc5216107a959b37f9a61e
  • Pointer size: 130 Bytes
  • Size of remote file: 55.6 kB
cache_dir/super_glue_boolq_test_question/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 830d91aeabc6b623cacee45562d6f52d266c67dba01354b5d1f3c546e09986a8
  • Pointer size: 130 Bytes
  • Size of remote file: 40.7 kB
cache_dir/super_glue_cb_test_hypothesis/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 94e5b725f3a12b5e2a38f98d2e2aa4c310fd2fa7747525e53f9a465bf5488f5d
  • Pointer size: 130 Bytes
  • Size of remote file: 37.4 kB
cache_dir/super_glue_cb_test_premise/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: a61f51f3fc8e2d0aee773e98c0fb31ca38d79f22efa16a2b74da829c6d20bba6
  • Pointer size: 130 Bytes
  • Size of remote file: 40.8 kB
cache_dir/super_glue_copa_test_choice1/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 498d1a103ef9cedf4438d9ebe6b0b5cf1f68d61ec1c2037898942143eb5b8b11
  • Pointer size: 130 Bytes
  • Size of remote file: 34 kB
cache_dir/super_glue_copa_test_choice2/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: ff2e28dc98ae6b7c8c76d70274fb8aa6bd97efb0ee75ba4ccc36e07d5d751f59
  • Pointer size: 130 Bytes
  • Size of remote file: 34.7 kB
cache_dir/super_glue_copa_test_premise/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 80072ea80d10e77bfce285976ddb734316278e348738245c2ab1a530e7f1ff7a
  • Pointer size: 130 Bytes
  • Size of remote file: 34.5 kB
cache_dir/super_glue_copa_test_question/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 577503448bdd44845edf1ba231c346babcef16d63c69ac331a112ad500cd3567
  • Pointer size: 130 Bytes
  • Size of remote file: 23.8 kB
cache_dir/wikitext_wikitext-103-raw-v1_test_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 051870fd6d7ee89f5e6888c562303ffbc330b3aecf3957e4f8a7f53eee9cf9b0
  • Pointer size: 130 Bytes
  • Size of remote file: 38.6 kB
cache_dir/wikitext_wikitext-103-v1_test_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 3200e3dae5ac8e34146940c6063c605d1c7958d625e9647d9c73b4a5f922067b
  • Pointer size: 130 Bytes
  • Size of remote file: 38.5 kB
cache_dir/wikitext_wikitext-2-raw-v1_test_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: 051870fd6d7ee89f5e6888c562303ffbc330b3aecf3957e4f8a7f53eee9cf9b0
  • Pointer size: 130 Bytes
  • Size of remote file: 38.6 kB
cache_dir/wikitext_wikitext-2-v1_test_text/fig_tok_length.png ADDED

Git LFS Details

  • SHA256: c0be97060b6e59db4de6c8c8435bb35dbd30a153128b66080fbfcbcce0098a59
  • Pointer size: 130 Bytes
  • Size of remote file: 38.5 kB
data_measurements/dataset_statistics.py CHANGED
@@ -498,7 +498,7 @@ class DatasetStatisticsCacheClass:
         if not self.live:
             if self.tokenized_df is None:
                 logs.warning("Tokenized dataset not yet loaded; doing so.")
-                self.load_or_prepare_dataset()
+                self.load_or_prepare_tokenized_df()
             if self.vocab_counts_df is None:
                 logs.warning("Vocab not yet loaded; doing so.")
                 self.load_or_prepare_vocab()
@@ -544,8 +544,8 @@
         """
         logs.info("Doing text dset.")
         self.load_or_prepare_text_dset(save)
-        logs.info("Doing tokenized dataframe")
-        self.load_or_prepare_tokenized_df(save)
+        #logs.info("Doing tokenized dataframe")
+        #self.load_or_prepare_tokenized_df(save)
         logs.info("Doing dataset peek")
         self.load_or_prepare_dset_peek(save)
 
@@ -554,11 +554,12 @@
             with open(self.dset_peek_json_fid, "r") as f:
                 self.dset_peek = json.load(f)["dset peek"]
         else:
-            if self.dset is None:
-                self.get_base_dataset()
-            self.dset_peek = self.dset[:100]
-            if save:
-                write_json({"dset peek": self.dset_peek}, self.dset_peek_json_fid)
+            if not self.live:
+                if self.dset is None:
+                    self.get_base_dataset()
+                self.dset_peek = self.dset[:100]
+                if save:
+                    write_json({"dset peek": self.dset_peek}, self.dset_peek_json_fid)
 
     def load_or_prepare_tokenized_df(self, save=True):
         if self.use_cache and exists(self.tokenized_df_fid):
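The dset_peek hunk above adds a `not self.live` guard so the deployed app reads caches but never recomputes them. As a simplified, standalone sketch of that load-or-prepare pattern (function and argument names here are illustrative stand-ins, not the class's API):

import json
from os.path import exists

def load_or_prepare(cache_fid, compute_fn, live=False, save=True):
    # Cache hit: always safe to read, deployed or not.
    if exists(cache_fid):
        with open(cache_fid, "r") as f:
            return json.load(f)["dset peek"]
    # Cache miss: only recompute when not live, as the new guard does.
    if not live:
        peek = compute_fn()
        if save:
            with open(cache_fid, "w") as f:
                json.dump({"dset peek": peek}, f)
        return peek
    return None  # live with a missing cache: nothing to show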
data_measurements/streamlit_utils.py CHANGED
@@ -20,7 +20,7 @@ import streamlit as st
 from st_aggrid import AgGrid, GridOptionsBuilder
 
 from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
-
+st.set_option('deprecation.showPyplotGlobalUse', False)
 
 def sidebar_header():
     st.sidebar.markdown(
@@ -48,7 +48,10 @@ def sidebar_selection(ds_name_to_dict, column_id):
     )
     # choose a config to analyze
     ds_configs = ds_name_to_dict[ds_name]
-    config_names = list(ds_configs.keys())
+    if ds_name == "c4":
+        config_names = ['en','en.noblocklist','realnewslike']
+    else:
+        config_names = list(ds_configs.keys())
     config_name = st.selectbox(
         f"Choose configuration{column_id}:",
         config_names,
@@ -319,72 +322,75 @@ def expander_npmi_description(min_vocab):
 
 ### Finally, show Zipf stuff
 def expander_zipf(z, zipf_fig, column_id):
-    _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
-    natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
-    calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
-
-    powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
-    zipf_summary = (
-        "The optimal alpha based on this dataset is: **"
-        + str(round(z.alpha, 2))
-        + "**, with a KS distance of: **"
-        + str(round(z.distance, 2))
-    )
-    zipf_summary += (
-        "**. This was fit with a minimum rank value of: **"
-        + str(int(z.xmin))
-        + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
-    )
-
-    alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
-    xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
-    fit_results_table = pd.DataFrame.from_dict(
-        {
-            r"Alpha:": [str("%.2f" % z.alpha)],
-            "KS distance:": [str("%.2f" % z.distance)],
-            "Min rank:": [str("%s" % int(z.xmin))],
-        },
-        columns=["Results"],
-        orient="index",
-    )
-    fit_results_table.index.name = column_id
     with st.expander(
         f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
     ):
-        st.caption(
-            "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
-        )
-        st.markdown(_ZIPF_CAPTION)
-        st.write(
-            """
-            A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
-            with an ideal α value of 1."""
-        )
-        st.markdown(
-            "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relatively _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
-        )
-        st.markdown(
-            "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
-        )
-        st.markdown("-----")
-        st.write("### Here is your dataset's Zipf results:")
-        st.dataframe(fit_results_table)
-        st.write(zipf_summary)
-        # TODO: Nice UI version of the content in the comments.
-        # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
-        # if z.ks_test.pvalue < 0.01:
-        #     st.markdown(
-        #         "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
-        # else:
-        #     st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
-        # st.markdown("Checking the goodness of fit of our observed distribution")
-        # st.markdown("to the hypothesized power law distribution")
-        # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
-        st.plotly_chart(zipf_fig, use_container_width=True)
-        if z.alpha > 2:
-            st.markdown(alpha_warning)
-        if z.xmin > 5:
-            st.markdown(xmin_warning)
+        try:
+            _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
+            natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
+            calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
+
+            powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
+            zipf_summary = (
+                "The optimal alpha based on this dataset is: **"
+                + str(round(z.alpha, 2))
+                + "**, with a KS distance of: **"
+                + str(round(z.distance, 2))
+            )
+            zipf_summary += (
+                "**. This was fit with a minimum rank value of: **"
+                + str(int(z.xmin))
+                + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
+            )
+
+            alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
+            xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
+            fit_results_table = pd.DataFrame.from_dict(
+                {
+                    r"Alpha:": [str("%.2f" % z.alpha)],
+                    "KS distance:": [str("%.2f" % z.distance)],
+                    "Min rank:": [str("%s" % int(z.xmin))],
+                },
+                columns=["Results"],
+                orient="index",
+            )
+            fit_results_table.index.name = column_id
+            st.caption(
+                "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
+            )
+            st.markdown(_ZIPF_CAPTION)
+            st.write(
+                """
+                A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
+                with an ideal α value of 1."""
+            )
+            st.markdown(
+                "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relatively _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
+            )
+            st.markdown(
+                "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
+            )
+            st.markdown("-----")
+            st.write("### Here is your dataset's Zipf results:")
+            st.dataframe(fit_results_table)
+            st.write(zipf_summary)
+            # TODO: Nice UI version of the content in the comments.
+            # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
+            # if z.ks_test.pvalue < 0.01:
+            #     st.markdown(
+            #         "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
+            # else:
+            #     st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
+            # st.markdown("Checking the goodness of fit of our observed distribution")
+            # st.markdown("to the hypothesized power law distribution")
+            # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
+            st.plotly_chart(zipf_fig, use_container_width=True)
+            if z.alpha > 2:
+                st.markdown(alpha_warning)
+            if z.xmin > 5:
+                st.markdown(xmin_warning)
+        except:
+            st.write("Under construction!")
 
 
 ### Finally finally finally, show nPMI stuff.
@@ -427,17 +433,23 @@ def npmi_widget(npmi_stats, min_vocab, column_id):
 
 def npmi_show(paired_results):
     if paired_results.empty:
-        st.markdown("No words that co-occur enough times for results! Or there's a 🐛.")
+        st.markdown("No words that co-occur enough times for results! Or there's a 🐛. Or we're still computing this one. 🤷")
     else:
         s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
         # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])
         s.index.name = "word"
        npmi_cols = s.filter(like="npmi").columns
         count_cols = s.filter(like="count").columns
+        if s.shape[0] > 10000:
+            bias_thres = max(abs(s["npmi-bias"][5000]), abs(s["npmi-bias"][-5000]))
+            print(f"filtering with bias threshold: {bias_thres}")
+            s_filtered = s[s["npmi-bias"].abs() > bias_thres]
+        else:
+            s_filtered = s
         # TODO: This is very different look than the duplicates table above. Should probably standardize.
         cm = sns.palplot(sns.diverging_palette(270, 36, s=99, l=48, n=16))
         out_df = (
-            s.style.background_gradient(subset=npmi_cols, cmap=cm)
+            s_filtered.style.background_gradient(subset=npmi_cols, cmap=cm)
             .format(subset=npmi_cols, formatter="{:,.3f}")
             .format(subset=count_cols, formatter=int)
             .set_properties(
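The new filtering in npmi_show bounds the size of the styled table: once the frame (already sorted by npmi-bias) has more than 10,000 rows, the 5,000th value from each end sets a magnitude cutoff, and only rows whose absolute bias exceeds it survive. A toy reproduction of that logic on synthetic data (iloc is used here in place of the diff's positional [5000]/[-5000] lookups):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
s = pd.DataFrame({"npmi-bias": rng.normal(size=20000)})
s = s.sort_values(by="npmi-bias", ascending=True)

if s.shape[0] > 10000:
    # Magnitude of the 5000th entry from each end of the sorted column.
    bias_thres = max(abs(s["npmi-bias"].iloc[5000]),
                     abs(s["npmi-bias"].iloc[-5000]))
    # Keeps at most ~5000 of the most negatively and ~5000 of the most
    # positively biased rows, so the gradient-styled table stays small.
    s_filtered = s[s["npmi-bias"].abs() > bias_thres]
else:
    s_filtered = s

print(len(s), "->", len(s_filtered))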
requirements.txt CHANGED
@@ -10,7 +10,7 @@ iso_639==0.4.5
 datasets==1.15.1
 powerlaw==1.5
 numpy==1.19.5
-pandas==1.3.0
+pandas==1.0.0
 dataclasses==0.6
 iso639==0.1.4
 python_igraph==0.9.6
@@ -23,4 +23,4 @@ numexpr==2.7.3
 scikit-learn~=0.24.2
 scipy~=1.7.3
 tqdm~=4.62.3
-pyarrow~=6.0.1
+pyarrow~=6.0.1
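One note on the pinned powerlaw==1.5 (unchanged above): it is presumably what produces the z.alpha, z.xmin, and z.distance values consumed by expander_zipf in streamlit_utils.py. A minimal sketch of getting those three numbers from the package, on fabricated word counts standing in for the real vocabulary counts:

import numpy as np
import powerlaw

rng = np.random.default_rng(0)
# Heavy-tailed fake term frequencies, roughly Zipf-like.
counts = np.round(rng.pareto(a=1.2, size=5000) + 1).astype(int)

fit = powerlaw.Fit(counts, discrete=True)
print("alpha:", fit.power_law.alpha)    # exponent, cf. z.alpha
print("xmin:", fit.power_law.xmin)      # start of the scaling regime, cf. z.xmin
print("KS distance:", fit.power_law.D)  # goodness of fit, cf. z.distance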