|
import numpy as np |
|
import pandas as pd |
|
|
|
|
|
def get_statistics(start_msg, end_msg, annotated_msg):
    """Compute length-normalised edit statistics for one annotated diff.

    Args:
        start_msg: The message before editing; only its ``len()`` is used
            (presumably a string -- confirm against callers).
        end_msg: The message after editing; only its ``len()`` is used.
        annotated_msg: Iterable of ``(text, change_type)`` pairs. Pairs whose
            ``change_type`` is ``'-'`` count as deletions, ``'+'`` as
            insertions; any other marker is ignored.

    Returns:
        A dict with three ratios:

        * ``"deletions"``: deleted length divided by ``len(start_msg)``
        * ``"insertions"``: inserted length divided by ``len(end_msg)``
        * ``"changes"``: deleted plus inserted length divided by
          ``len(end_msg)``

        When the denominator is zero the ratio is ``0.0`` if the numerator is
        also zero, and ``inf`` otherwise, instead of raising
        ``ZeroDivisionError``.
    """
    deleted = 0
    inserted = 0
    for text, change_type in annotated_msg:
        if change_type == '-':
            deleted += len(text)
        elif change_type == '+':
            inserted += len(text)

    def _ratio(amount, total):
        # An empty message would otherwise abort the whole computation with
        # ZeroDivisionError; "nothing changed out of nothing" is reported as 0.
        if total:
            return amount / total
        return 0.0 if amount == 0 else float("inf")

    start_length = len(start_msg)
    end_length = len(end_msg)

    return {
        "deletions": _ratio(deleted, start_length),
        "insertions": _ratio(inserted, end_length),
        # Total change volume is normalised by the final message length.
        "changes": _ratio(deleted + inserted, end_length),
    }
|
|
|
|
|
def get_statistics_for_df(df: pd.DataFrame, start_col, end_col, annotated_col):
    """Compute per-row edit statistics for a DataFrame, grouped by statistic.

    Args:
        df: DataFrame holding one diff per row.
        start_col: Name of the column passed as the start message.
        end_col: Name of the column passed as the end message.
        annotated_col: Name of the column passed as the annotated diff.

    Returns:
        A dict mapping each statistic name returned by ``get_statistics`` to
        a NumPy array with one value per row of ``df``, in row order.

    Raises:
        AssertionError: If ``df`` has no rows.
    """
    # Zipping the three columns avoids building a Series for every row, which
    # is what ``iterrows`` does.
    stats = [
        get_statistics(start_msg, end_msg, annotated_msg)
        for start_msg, end_msg, annotated_msg in zip(df[start_col], df[end_col], df[annotated_col])
    ]

    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exception type is kept for existing callers.
    if not stats:
        raise AssertionError("cannot compute statistics for an empty DataFrame")

    return {stat_name: np.asarray([entry[stat_name] for entry in stats]) for stat_name in stats[0]}
|
|
|
|
|
def get_statistics_for_manual_df(df):
    """Run ``get_statistics_for_df`` with this wrapper's fixed column names.

    The start message is read from ``commit_msg_start``, the end message from
    ``commit_msg_end`` and the annotated diff from ``annotated_diff``.
    """
    column_names = {
        "start_col": "commit_msg_start",
        "end_col": "commit_msg_end",
        "annotated_col": "annotated_diff",
    }
    return get_statistics_for_df(df, **column_names)
|
|
|
|
|
def get_statistics_for_synthetic_df(df):
    """Run ``get_statistics_for_df`` with this wrapper's fixed column names.

    The start message is read from ``initial_msg_pred``, the end message from
    ``reference`` and the annotated diff from ``annotated_diff``.
    """
    start_column = "initial_msg_pred"
    end_column = "reference"
    diff_column = "annotated_diff"
    return get_statistics_for_df(
        df,
        start_col=start_column,
        end_col=end_column,
        annotated_col=diff_column,
    )
|
|