# Page header captured together with this script: "Spaces: Sleeping".
import json
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from jsonargparse import ArgumentParser
def parse_args():
    """Parse command-line arguments.

    Every option is optional and falls back to its default when omitted.

    Returns:
        dict: Parsed options keyed by option name (``rating_path``,
        ``book_path``, ``out_dir``, ``limit``), suitable for ``main(**...)``.
    """
    parser = ArgumentParser()
    # The options are deliberately NOT marked ``required=True``: a required
    # option must always be supplied, which would make its default dead code.
    parser.add_argument(
        "--rating_path",
        type=str,
        default="./dataset/ratings.csv",
        help="Path to the ratings CSV file.",
    )
    parser.add_argument(
        "--book_path",
        type=str,
        default="./dataset/books.csv",
        help="Path to the books CSV file.",
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default="./processed",
        help="Directory that receives the generated files.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=1000,
        help="Maximum number of rating rows to read.",
    )
    return vars(parser.parse_args())
def main(
    rating_path,
    book_path,
    out_dir,
    limit,
    **kwargs
):
    """Build rating matrices, ID maps and a per-book summary from raw CSVs.

    Reads at most ``limit`` rows of the ratings file, pivots them into a dense
    book-by-user matrix ``Y`` (0 where there is no rating) and derives the
    indicator matrix ``R`` (1 where ``Y`` is non-zero, else 0). The following
    files are written to ``out_dir``:

    * ``Y.npy`` / ``R.npy`` -- the dense matrices.
    * ``book_id_map.json`` -- ISBN (string) -> row index of ``Y`` / ``R``.
    * ``user_id_map.json`` -- User-ID (string) -> column index of ``Y`` / ``R``.
    * ``summary_book.csv`` -- book rows joined with ``Mean-Rating`` and
      ``Num-Rating``, without the ``Image-URL-*`` columns.

    Args:
        rating_path: ';'-delimited, ISO-8859-1 encoded CSV with the columns
            ``User-ID``, ``ISBN`` and ``Book-Rating``.
        book_path: ';'-delimited, ISO-8859-1 encoded CSV with an ``ISBN``
            column and the three ``Image-URL-*`` columns.
        out_dir: Output directory; created (with parents) when missing.
        limit: Maximum number of rating rows to read.
        **kwargs: Accepted and ignored, so a parsed-arguments dict carrying
            extra keys can be splatted into this function.

    Raises:
        FileExistsError: If ``out_dir`` exists but is not a directory.
        ValueError: If the same (ISBN, User-ID) pair occurs more than once
            in the rows that were read (raised by ``DataFrame.pivot``).
    """
    # Keep ISBNs as text: a numeric-looking ISBN parsed as an integer would
    # lose its leading zeros and no longer match the book file.
    data = pd.read_csv(
        rating_path,
        delimiter=';',
        nrows=limit,
        encoding='ISO-8859-1',
        dtype={'ISBN': str},
    )

    # Make Y (rows = books, columns = users, missing ratings become 0).
    rating_matrix = data.pivot(
        index='ISBN', columns='User-ID', values='Book-Rating'
    ).fillna(0)
    Y = rating_matrix.values

    # Make R
    R = np.where(Y != 0, 1, 0)

    # Save Y and R as dense matrices. ``exist_ok=True`` accepts an existing
    # directory and still raises FileExistsError for an existing non-directory.
    out_dir_path = Path(out_dir)
    out_dir_path.mkdir(parents=True, exist_ok=True)
    np.save(out_dir_path / 'Y.npy', Y)
    np.save(out_dir_path / 'R.npy', R)

    # Create mappings for book and user IDs. They are taken from the pivot's
    # own labels because pivot orders its rows and columns by label, not by
    # first appearance in the input; enumerating ``unique()`` would give
    # indices that do not line up with Y and R. Keys are stringified so they
    # are valid JSON object keys.
    book_id_map = {
        str(isbn): row for row, isbn in enumerate(rating_matrix.index)
    }
    user_id_map = {
        str(user): col for col, user in enumerate(rating_matrix.columns)
    }

    # Save book_id_map to file
    with open(out_dir_path / 'book_id_map.json', 'w', encoding='utf-8') as f:
        json.dump(book_id_map, f)

    # Save user_id_map to file
    with open(out_dir_path / 'user_id_map.json', 'w', encoding='utf-8') as f:
        json.dump(user_id_map, f)

    # Get summary: mean rating and number of ratings per book.
    aggregations = {
        "Book-Rating": "mean",
        "User-ID": "count",
    }
    summary_rating = (
        data.groupby("ISBN")
        .agg(aggregations)
        .rename(columns={"Book-Rating": "Mean-Rating", "User-ID": "Num-Rating"})
    )

    book_df = pd.read_csv(
        book_path,
        delimiter=';',
        encoding='ISO-8859-1',
        on_bad_lines='skip',
        dtype={'ISBN': str},
    )
    # Only keep books that actually occur in the rating matrix.
    book_df = book_df[book_df["ISBN"].isin(rating_matrix.index)]

    df = book_df.merge(summary_rating, how="left", on="ISBN")
    # Drops every row that has a missing value in any column.
    df = df.dropna()
    df = df.drop(columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"])
    df.to_csv(out_dir_path / "summary_book.csv", index=False)
# Script entry point: parse the command line and forward the options to main().
if __name__ == "__main__":
    cli_options = parse_args()
    main(**cli_options)