OSainz's picture
Add PR links to previous commits
f35c65c
raw
history blame
9.64 kB
import json
import os
import filelock
import huggingface_hub
import pandas as pd
from utils import (
build_datasets_urls,
build_models_urls,
build_text_icon,
download_favicons,
get_base_url,
get_domain_name,
)
# Icon shown next to entries that resolve to a Hugging Face page.
HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"
# Icon shown next to entries with no associated link (e.g. missing PR).
CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png"
# When True, skip all download/upload traffic with the shared Hub cache
# and rely purely on the local JSON cache files.
DISABLE_ONLINE_CACHE = False
# Hub dataset repo that stores the shared cache JSON files.
ONLINE_CACHE = "CONDA-Workshop/RequestCache"
def save_cache(cache_data, cache_file, initial_timestamp):
    """Persist *cache_data* to *cache_file*, merging concurrent writes.

    Args:
        cache_data (dict): Entries produced by this process.
        cache_file (str): Path of the JSON cache file to write.
        initial_timestamp (float | None): mtime of *cache_file* when it was
            first read by the caller, or None if it did not exist then.

    Returns:
        dict: The merged mapping that was written to disk. NOTE: on a
        conflicting concurrent write, entries already on disk win over the
        in-memory ones (``cache_data.update(cache_dict)``).

    Side effects: writes *cache_file*, creates/uses ``<cache_file>.lock``,
    and (unless DISABLE_ONLINE_CACHE) best-effort uploads the file to the
    ONLINE_CACHE Hub dataset repo.
    """
    print(f"Saving cache to {cache_file}")
    # Acquire lock before reading and updating the file to prevent race conditions
    with filelock.FileLock(f"{cache_file}.lock"):
        # Check if the file has been modified since the initial read
        current_timestamp = (
            os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
        )
        if current_timestamp is None or initial_timestamp != current_timestamp:
            # File has been modified or created since initial read, re-read the file
            try:
                with open(cache_file, "r", encoding="utf8") as f:
                    # Update the dictionary with newly added entries
                    cache_dict = json.load(f)
                    # Test if cache_dict and cache_data are different
                    if cache_dict != cache_data:
                        # Disk entries take precedence over in-memory ones.
                        cache_data.update(cache_dict)
            except FileNotFoundError:
                pass  # If the file doesn't exist at this point, continue with the current dictionary
        # Write the updated dictionary back to the file
        with open(cache_file, "w", encoding="utf8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=4)
        if not DISABLE_ONLINE_CACHE:
            try:
                # token=True falls back to the locally saved Hub credential
                # when the TOKEN env var is unset or empty.
                huggingface_hub.upload_file(
                    repo_id=ONLINE_CACHE,
                    repo_type="dataset",
                    token=os.environ.get("TOKEN") or True,
                    path_in_repo=cache_file,
                    path_or_fileobj=cache_file,
                )
            except Exception as e:
                print(f"Unable to upload {cache_file}: {e}")
    return cache_data
def update_favicon_cache(sources):
    """Return a base-URL -> favicon-URL mapping covering *sources*.

    Refreshes the local copy of the shared cache from the Hub (best
    effort), loads it, downloads favicons only for domains not already
    cached, then persists the merged result through ``save_cache``.
    """
    cache_path = "favicons.json"
    favicons = {}
    last_seen_mtime = None

    # Best-effort pull of the shared cache file from the Hub.
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=cache_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download favicons.json: {e}")

    # Load whatever local copy exists, noting its mtime so save_cache can
    # detect writers that raced us.
    if os.path.exists(cache_path):
        last_seen_mtime = os.path.getmtime(cache_path)
        try:
            with open(cache_path, "r", encoding="utf8") as fp:
                favicons = json.load(fp)
        except FileNotFoundError:
            pass  # Deleted between the exists() check and the open; start empty.

    # Only fetch icons for domains we have never resolved before.
    to_fetch = [domain for domain in sources if domain not in favicons]
    if to_fetch:
        favicons.update(download_favicons(to_fetch))

    return save_cache(
        cache_data=favicons,
        cache_file=cache_path,
        initial_timestamp=last_seen_mtime,
    )
def update_model_url_cache(models):
    """Return a model-name -> URL mapping covering every entry in *models*.

    None entries are dropped and duplicates collapsed before consulting the
    cache; URLs are resolved only for models not already cached, and the
    merged mapping is persisted through ``save_cache``.
    """
    cache_path = "model_urls.json"
    # Drop missing values and deduplicate in one pass.
    unique_models = {m for m in models if m is not None}
    url_map = {}
    last_seen_mtime = None

    # Best-effort pull of the shared cache file from the Hub.
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=cache_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download model_urls.json: {e}")

    # Load whatever local copy exists, noting its mtime so save_cache can
    # detect writers that raced us.
    if os.path.exists(cache_path):
        last_seen_mtime = os.path.getmtime(cache_path)
        try:
            with open(cache_path, "r", encoding="utf8") as fp:
                url_map = json.load(fp)
        except FileNotFoundError:
            pass  # Deleted between the exists() check and the open; start empty.

    # Resolve URLs only for models missing from the cache.
    to_resolve = [m for m in unique_models if m not in url_map]
    if to_resolve:
        url_map.update(build_models_urls(to_resolve))

    return save_cache(
        cache_data=url_map,
        cache_file=cache_path,
        initial_timestamp=last_seen_mtime,
    )
def update_dataset_url_cache(datasets):
    """Return a dataset-name -> URL mapping covering every entry in *datasets*.

    None entries are dropped and duplicates collapsed before consulting the
    cache; URLs are resolved only for datasets not already cached, and the
    merged mapping is persisted through ``save_cache``.
    """
    cache_path = "dataset_urls.json"
    # Drop missing values and deduplicate in one pass.
    unique_datasets = {d for d in datasets if d is not None}
    url_map = {}
    last_seen_mtime = None

    # Best-effort pull of the shared cache file from the Hub.
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=cache_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download dataset_urls.json: {e}")

    # Load whatever local copy exists, noting its mtime so save_cache can
    # detect writers that raced us.
    if os.path.exists(cache_path):
        last_seen_mtime = os.path.getmtime(cache_path)
        try:
            with open(cache_path, "r", encoding="utf8") as fp:
                url_map = json.load(fp)
        except FileNotFoundError:
            pass  # Deleted between the exists() check and the open; start empty.

    # Resolve URLs only for datasets missing from the cache.
    to_resolve = [d for d in unique_datasets if d not in url_map]
    if to_resolve:
        url_map.update(build_datasets_urls(to_resolve))

    return save_cache(
        cache_data=url_map,
        cache_file=cache_path,
        initial_timestamp=last_seen_mtime,
    )
def get_dataframe():
    """Load contamination_report.csv and decorate it for display.

    Returns:
        pandas.DataFrame: The report with the Reference / PR /
        Evaluation Dataset / Contaminated Source columns replaced by
        icon-decorated links (via ``build_text_icon``), the Subset column
        folded into Evaluation Dataset and dropped, and the three split
        percentage columns converted to fractions.

    Side effects: refreshes the favicon / model-URL / dataset-URL caches
    (local JSON files plus, unless disabled, the shared Hub cache).
    """
    data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)

    # Refresh the lookup caches needed to build the links below.
    # (Fixed: dropped a dead `favicon_dict = {}` that was immediately
    # overwritten by the call below.)
    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])
    model_url_dict = update_model_url_cache(
        data[data["Model or corpus"] == "model"]["Contaminated Source"]
    )
    dataset_url_dict = update_dataset_url_cache(
        list(data["Evaluation Dataset"])
        + list(data[data["Model or corpus"] == "corpus"]["Contaminated Source"])
    )

    # Reference -> "domain-name" link with the site's favicon.
    data["Reference"] = data["Reference"].apply(
        lambda x: build_text_icon(
            text=get_domain_name(x),
            url=x,
            icon_url=favicon_dict.get(get_base_url(x), ""),
        )
    )

    PR_URL_FORMAT = "https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions/{}"
    # A NaN PR number means the row has no associated pull request.
    # (Fixed: the icon branch used the obscure NaN self-inequality test
    # `x == x`; both branches now use pd.notna with identical semantics.)
    data["PR"] = data["PR"].apply(
        lambda x: build_text_icon(
            text="",
            url=PR_URL_FORMAT.format(int(x)) if pd.notna(x) else "no link",
            icon_url=HF_ICON if pd.notna(x) else CROSS_ICON,
        )
    )

    data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
        lambda x: build_text_icon(
            text=x,
            url=dataset_url_dict.get(x, ""),
            icon_url=HF_ICON,
        )
    )

    # Fold the Subset column into the dataset label, then drop it.
    data["Evaluation Dataset"] = data.apply(
        lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})"
        if pd.notna(x["Subset"])
        else x["Evaluation Dataset"],
        axis=1,
    )
    del data["Subset"]

    # Contaminated Source links resolve against the dataset-URL cache for
    # corpora and the model-URL cache for models.
    data["Contaminated Source"] = data.apply(
        lambda x: build_text_icon(
            text=x["Contaminated Source"],
            url=dataset_url_dict.get(x["Contaminated Source"], "")
            if x["Model or corpus"] == "corpus"
            else model_url_dict.get(x["Contaminated Source"], ""),
            icon_url=HF_ICON,
        ),
        axis=1,
    )

    # Percentages -> fractions; falsy values (0, empty) pass through
    # unchanged, matching the original per-column lambdas.
    for split_col in ("Train Split", "Development Split", "Test Split"):
        data[split_col] = data[split_col].apply(lambda x: x / 100 if x else x)

    return data