from typing import Dict, List | |
import pandas as pd | |
from utilities.my_logger import setup_logger | |
# Setup logging | |
logger = setup_logger(__name__) | |
def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
    """
    Build a cleaned DataFrame from a list of praw submission dictionaries.

    The following steps are applied:
    1. 'date_utc' is parsed with ``pd.to_datetime``.
    2. The 'poster_link' column is dropped when present.
    3. An 'id' column is added, taken from the piece at index 4 of
       'permalink' split on '/'. For a permalink shaped like
       '/r/<subreddit>/comments/<id>/<slug>/' that piece is '<id>'
       (index 0 is the empty string before the leading slash).

    Parameters:
    - submissions: List of submission dictionaries. Non-empty input is
      expected to provide 'date_utc' and 'permalink' keys.

    Returns:
    - pd.DataFrame: Preprocessed DataFrame. When `submissions` is empty the
      result is an empty DataFrame with no columns.

    Raises:
    - AttributeError: If non-empty input lacks the 'date_utc' or 'permalink'
      column.
    """
    praw_df = pd.DataFrame(submissions)

    # An empty input produces a frame with no rows and no columns, so the
    # column accesses below would fail; hand the empty frame back instead.
    if praw_df.shape == (0, 0):
        return praw_df

    # Bracket assignment is the documented way to set a column; attribute
    # access is kept for the read so a missing column still raises
    # AttributeError as it always has.
    praw_df['date_utc'] = pd.to_datetime(praw_df.date_utc)

    # 'poster_link' is optional in the input; drop it only if it is there.
    praw_df = praw_df.drop(columns='poster_link', errors='ignore')

    # Index 4 of the '/'-split permalink; yields NaN for rows whose
    # permalink has fewer than five pieces.
    praw_df['id'] = praw_df.permalink.str.split('/').str[4]

    return praw_df