Commit
•
04fde16
1
Parent(s):
1756d68
Updating "updated" to exclude score updates
Browse files
utilities/data_processing.py
CHANGED
@@ -23,27 +23,28 @@ def data_processing(df: pd.DataFrame) -> pd.DataFrame:
|
|
23 |
df['content_length'] = df['content'].str.len()
|
24 |
|
25 |
# Find row with the longest content for each 'id'
|
26 |
-
idx_longest_content = df.groupby('id')['content_length'].idxmax()
|
27 |
df_longest_content = df.loc[idx_longest_content][['id', 'content']]
|
28 |
|
29 |
# Find row with the highest score for each 'id'
|
30 |
-
idx_highest_score = df.groupby('id')['score'].idxmax()
|
31 |
df_highest_score = df.loc[idx_highest_score][['id', 'score']]
|
32 |
|
33 |
# Merge the two DataFrames on 'id'
|
34 |
df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
|
35 |
|
36 |
-
#
|
37 |
df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
|
38 |
-
df_merged['updated'] = (df_merged['content'] != df_merged['content_original']) | (
|
39 |
-
df_merged['score'] != df_merged['score_original'])
|
40 |
|
41 |
-
#
|
42 |
-
df_merged
|
43 |
|
44 |
# Drop original content and score columns
|
45 |
df_merged.drop(columns=['content_original', 'score_original'], inplace=True)
|
46 |
|
|
|
|
|
|
|
47 |
return df_merged
|
48 |
|
49 |
|
|
|
23 |
df['content_length'] = df['content'].str.len()
|
24 |
|
25 |
# Find row with the longest content for each 'id'
|
26 |
+
idx_longest_content = df.groupby('id')['content_length'].idxmax()
|
27 |
df_longest_content = df.loc[idx_longest_content][['id', 'content']]
|
28 |
|
29 |
# Find row with the highest score for each 'id'
|
30 |
+
idx_highest_score = df.groupby('id')['score'].idxmax()
|
31 |
df_highest_score = df.loc[idx_highest_score][['id', 'score']]
|
32 |
|
33 |
# Merge the two DataFrames on 'id'
|
34 |
df_merged = pd.merge(df_longest_content, df_highest_score, on='id')
|
35 |
|
36 |
+
# Merge with original DataFrame to compare content and score
|
37 |
df_merged = df_merged.merge(original_df, on='id', suffixes=('', '_original'))
|
|
|
|
|
38 |
|
39 |
+
# Check if the content was updated for each id (score changes are ignored)
|
40 |
+
df_merged['updated'] = (df_merged['content'] != df_merged['content_original'])
|
41 |
|
42 |
# Drop original content and score columns
|
43 |
df_merged.drop(columns=['content_original', 'score_original'], inplace=True)
|
44 |
|
45 |
+
# Drop duplicate ids so each id appears once (keeps the first row per id)
|
46 |
+
df_merged.drop_duplicates(subset='id', inplace=True)
|
47 |
+
|
48 |
return df_merged
|
49 |
|
50 |
|