File size: 858 Bytes
285612d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from typing import Dict, List

import pandas as pd

from utilities.my_logger import setup_logger

# Module-level logger, obtained from the project's setup_logger helper by
# passing this module's name.
logger = setup_logger(__name__)


def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw data into a DataFrame.

    Builds a DataFrame from the submission dictionaries, parses 'date_utc'
    into datetimes, drops the 'poster_link' column when present, and adds an
    'id' column extracted from each permalink.

    Parameters:
    - submissions: List of submission dictionaries. They must provide the
      'date_utc' and 'permalink' keys; any other keys become columns
      unchanged. An empty list is accepted.

    Returns:
    - pd.DataFrame: Preprocessed DataFrame. For an empty input it has zero
      rows and the columns 'date_utc', 'permalink' and 'id'.

    Raises:
    - KeyError: If the submissions do not produce a 'date_utc' or
      'permalink' column.
    """

    if submissions:
        # Convert the submissions list to a DataFrame
        praw_df = pd.DataFrame(submissions)
    else:
        # A DataFrame built from an empty list has no columns at all, which
        # would make the column lookups below fail. Seed the columns this
        # function relies on so the empty case flows through the same steps.
        praw_df = pd.DataFrame(columns=['date_utc', 'permalink'])

    # Convert 'date_utc' column to datetime format
    praw_df['date_utc'] = pd.to_datetime(praw_df['date_utc'])

    # Remove 'poster_link' column if it exists
    praw_df = praw_df.drop(columns=['poster_link'], errors='ignore')

    # Use the piece at index 4 of the '/'-split permalink as 'id'.
    # NOTE(review): this presumably targets permalinks shaped like
    # '/r/<subreddit>/comments/<id>/<slug>/', where the leading slash makes
    # index 0 an empty string - confirm against the data producer.
    praw_df['id'] = praw_df['permalink'].str.split('/').str[4]

    return praw_df