Spaces:

reddit-tools-HF
/

dataset-creator-reddit-bestofredditorupdates

Running

derek-thomas HF staff committed on Nov 17, 2023

Commit

04fde16

•

1 Parent(s): 1756d68

Updating "updated" to exclude score updates

Files changed (1) hide show

utilities/data_processing.py CHANGED Viewed

@@ -23,27 +23,28 @@ def data_processing(df: pd.DataFrame) -> pd.DataFrame:
     df['content_length'] = df['content'].str.len()
     # Find row with the longest content for each 'id'
-    idx_longest_content = df.groupby('id')['content_length'].idxmax().values
     df_longest_content = df.loc[idx_longest_content][['id', 'content']]
     # Find row with the highest score for each 'id'
-    idx_highest_score = df.groupby('id')['score'].idxmax().values
     df_highest_score = df.loc[idx_highest_score][['id', 'score']]
     # Merge the two DataFrames on 'id'
     df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
-    # Check if the content or score was updated for each id
     df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
-    df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
-            df_merged['score'] != df_merged['score_original'])
-    # Drop duplicates to keep only the rows with longest content and highest score
-    df_merged.drop_duplicates(subset='id', inplace=True)
     # Drop original content and score columns
     df_merged.drop(columns=['content_original', 'score_original'], inplace=True)
     return df_merged

     df['content_length'] = df['content'].str.len()
     # Find row with the longest content for each 'id'
+    idx_longest_content = df.groupby('id')['content_length'].idxmax()
     df_longest_content = df.loc[idx_longest_content][['id', 'content']]
     # Find row with the highest score for each 'id'
+    idx_highest_score = df.groupby('id')['score'].idxmax()
     df_highest_score = df.loc[idx_highest_score][['id', 'score']]
     # Merge the two DataFrames on 'id'
     df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
+    # Merge with original DataFrame to compare content and score
     df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
+    # Check if the content was updated for each id (score-only changes are ignored)
+    df_merged['updated'] = (df_merged['content'] != df_merged['content_original'])
     # Drop original content and score columns
     df_merged.drop(columns=['content_original', 'score_original'], inplace=True)
+    # Drop duplicates to keep only the rows with longest content and highest score
+    df_merged.drop_duplicates(subset='id', inplace=True)
     return df_merged