# File-viewer metadata captured with this source (not part of the program):
#   Author: derek-thomas (HF staff)
#   Commit 285612d: "Major updates, moving away from pushshift.io into PRAW"
#   File size: 858 bytes
from typing import Dict, List
import pandas as pd
from utilities.my_logger import setup_logger
# Module-level logger, built by setup_logger (imported from
# utilities.my_logger) and named after this module via __name__.
logger = setup_logger(__name__)
def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw data into a DataFrame.

    The following steps are applied, in order:

    1. The list of submission dictionaries is turned into a DataFrame.
    2. The 'date_utc' column is parsed with ``pd.to_datetime``.
    3. The 'poster_link' column is dropped when present.
    4. An 'id' column is derived from 'permalink' by splitting on '/' and
       taking the piece at index 4.

    Parameters:
    - submissions: List of submission dictionaries. Every dictionary is
      expected to provide at least the 'date_utc' and 'permalink' keys.

    Returns:
    - pd.DataFrame: Preprocessed DataFrame. When `submissions` is empty, an
      empty DataFrame with the 'date_utc', 'permalink' and 'id' columns is
      returned instead of raising.

    Raises:
    - KeyError: If a non-empty input lacks the 'date_utc' or 'permalink'
      column.
    """
    # Convert the submissions list to a DataFrame
    praw_df = pd.DataFrame(submissions)

    # An empty input produces a frame without any columns, so the column
    # operations below would fail. Return an empty frame that still carries
    # the columns this function reads or creates.
    if praw_df.index.empty:
        return pd.DataFrame({
            'date_utc': pd.Series(dtype='datetime64[ns]'),
            'permalink': pd.Series(dtype='object'),
            'id': pd.Series(dtype='object'),
        })

    # Convert 'date_utc' column to datetime format. Bracket indexing is used
    # for the assignment because attribute-style assignment only works when
    # the column already exists and does not clash with a DataFrame attribute.
    praw_df['date_utc'] = pd.to_datetime(praw_df['date_utc'])

    # Remove 'poster_link' column if it exists
    praw_df = praw_df.drop(columns='poster_link', errors='ignore')

    # Extract the piece at index 4 of the '/'-split 'permalink' as 'id'.
    # A permalink that starts with '/' yields an empty string at index 0, so
    # for '/r/<sub>/comments/<id>/<slug>/' index 4 is '<id>'. Permalinks with
    # fewer pieces produce a missing value.
    praw_df['id'] = praw_df['permalink'].str.split('/').str[4]

    return praw_df