import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree
import pytz
from huggingface_hub import HfApi, Repository
frequency = os.environ.get("FREQUENCY", '').lower()
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
username = os.environ["USERNAME"]
hf_token = os.environ["HF_TOKEN"]
local_repo_path = "./readme_repo"
def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.
    """
    # Initialize HfApi with the module-level token so the upload is authenticated
    api = HfApi(token=hf_token)

    # Download the current README file from the dataset repo
    readme_path = api.hf_hub_download(repo_id=dataset_name, repo_type="dataset", filename="README.md")

    # Read it
    with open(readme_path, "r") as file:
        old_readme = file.read()

    # Modify it
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Commit modifications
    api.upload_file(
        path_or_fileobj=new_readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
        commit_message=f"Pushing {new_rows} new rows"
    )
def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the Reddit API to download the submissions.
There is a limit of 1000 results per API call and limited search functionality, so this is run {frequency} to get new submissions.

## Creation Details
This dataset was created by [{username}/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/{username}/dataset-creator-reddit-{subreddit})

## Update Frequency
The dataset is updated {frequency}, with the most recent update being `{latest_hour_str}`, where we added **{new_rows} new rows**.

## Licensing
[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
[License information]

## Opt-out
To opt out of this dataset, please make a pull request with your justification and add your ids to filter_ids.json:
1. Go to [filter_ids.json](https://huggingface.co/spaces/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates/blob/main/filter_ids.json)
2. Click Edit
3. Add your ids, 1 per row
4. Comment with your justification
"""

    if GENERATED_BELOW_MARKER in old_readme:
        # Replace everything after the marker with the freshly generated text
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        # First run: append the marker, then the generated text, to the existing README
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
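
# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original module): shows
# how the scheduled job might call update_dataset_readme after pushing new
# rows. The SUBREDDIT default, the dataset repo id, and the row count below
# are hypothetical placeholders; the real job would supply its own values.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    subreddit = os.environ.get("SUBREDDIT", "bestofredditorupdates")  # hypothetical env var
    dataset_name = f"{username}/reddit-dataset-{subreddit}"  # assumed dataset repo id
    update_dataset_readme(dataset_name=dataset_name, subreddit=subreddit, new_rows=42)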