derek-thomas (HF staff) committed
Commit: cdbb4c0
1 Parent(s): 84af1d7

Adding new rows to readme, and running at 5am GMT

Files changed (2)
  1. main.py +4 -2
  2. utilities/readme_update.py +2 -5
main.py CHANGED
@@ -48,7 +48,8 @@ def main():
     dataset['train'] = Dataset.from_pandas(new_df, preserve_index=False)
 
     # Update README
-    update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date)
+    new_rows = len(new_df) - len(old_df)
+    update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date, new_rows=new_rows)
 
     # Push the augmented dataset to the Hugging Face hub
     logger.debug(f"Pushing data for {date} to the Hugging Face hub")
@@ -60,7 +61,8 @@ def schedule_daily_task():
     """
     Schedule the daily_task to run at the specific time every day.
     """
-    start_time = (datetime.now() + timedelta(minutes=1)).time().strftime('%H:%M')  # Now + 30 seconds
+    # start_time = (datetime.now() + timedelta(minutes=1)).time().strftime('%H:%M')  # Now + 30 seconds
+    start_time = '05:00'
     logger.info(f'Scheduling tasks to run every day at: {start_time}')
     schedule.every().day.at(start_time).do(main)
 
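The hunk above only registers the 05:00 job; the loop that actually services the scheduler sits outside this diff. Below is a minimal sketch of how such a job is typically driven with the schedule package, assuming the container clock is UTC (so '05:00' matches the 5am GMT in the commit message); the run loop and sleep interval are assumptions, not code from this commit.

import time

import schedule


def main():
    ...  # stands in for the daily ingest/update job defined in main.py above


def schedule_daily_task():
    start_time = '05:00'  # local clock time; 5am GMT when the container runs in UTC
    schedule.every().day.at(start_time).do(main)
    # schedule only queues jobs; run_pending() has to be polled for them to fire
    while True:
        schedule.run_pending()
        time.sleep(60)

Pinning start_time to '05:00' rather than "now plus one minute" makes the job fire once a day at a fixed hour instead of immediately after every restart.
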
utilities/readme_update.py CHANGED
@@ -10,7 +10,7 @@ def get_readme_path(dataset_name):
     return cached_path(readme_path, download_config=DownloadConfig())
 
 
-def update_readme(dataset_name, subreddit, latest_date):
+def update_readme(dataset_name, subreddit, latest_date, new_rows):
     path = get_readme_path(dataset_name=dataset_name)
     readme_text = f"""
     ## Dataset Overview
@@ -18,11 +18,8 @@ The goal is to have an open dataset of `{subreddit}` submissions. Im leveraging
 
     There is a limit of 1000 in an API call and limited search functionality, so this is run every day to get new submissions.
 
-    # Dataset Name
-    {dataset_name}
-
     ## Update Frequency
-    The dataset is updated daily with the most recent day being: {latest_date}
+    The dataset is updated daily with the most recent day being `{latest_date}`where we added `{new_rows}` new rows.
     """
 
     append_readme(path=path, readme_text=readme_text)
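
For context, get_readme_path resolves the dataset card through the datasets cache, and append_readme (not touched by this diff) writes the generated section back. Below is a simplified standalone sketch of the same update with the new new_rows argument, appending to a plain local README.md and using made-up placeholder values rather than anything from the repository.

def update_readme(readme_path, subreddit, latest_date, new_rows):
    # Same shape as the function above, but writing to a local file instead of
    # the cached hub copy of the dataset card.
    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of `{subreddit}` submissions.

## Update Frequency
The dataset is updated daily with the most recent day being `{latest_date}` where we added `{new_rows}` new rows.
"""
    with open(readme_path, 'a', encoding='utf-8') as f:
        f.write(readme_text)


# Placeholder values only; the real job computes new_rows as len(new_df) - len(old_df).
update_readme('README.md', subreddit='askreddit', latest_date='2023-11-01', new_rows=42)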