Petr Tsvetkov
Generate charts for the presentation & diploma; some refactoring; add (commented) Student's t-test
7ab7be2
import pickle | |
import Levenshtein | |
import numpy as np | |
import pandas as pd | |
import plotly.figure_factory as ff | |
from scipy.stats import stats | |
import config | |
def get_statistics(start_msg, end_msg, annotated_msg):
    """Summarise the Levenshtein edit operations turning one message into another.

    Args:
        start_msg: Source string of the edit script.
        end_msg: Target string of the edit script.
        annotated_msg: Accepted for interface compatibility; not used here.

    Returns:
        A dict with the raw counts ``deletions``, ``insertions`` and
        ``changes`` plus their normalised variants. ``deletions_norm`` is
        divided by ``len(start_msg)``; ``insertions_norm`` and
        ``changes_norm`` are divided by ``len(end_msg)``.

    Raises:
        ZeroDivisionError: If ``start_msg`` or ``end_msg`` is empty.
    """
    # Tally the edit operations by kind in a single pass.
    tally = {"delete": 0, "insert": 0, "replace": 0}
    for op, _, _ in Levenshtein.editops(start_msg, end_msg):
        if op in tally:
            tally[op] += 1

    replaced = tally["replace"]
    # A replacement counts once towards the total number of changes ...
    total_changes = tally["delete"] + tally["insert"] + replaced
    # ... but is treated as both a deletion and an insertion.
    removed = tally["delete"] + replaced
    added = tally["insert"] + replaced

    summary = {
        "deletions": removed,
        "insertions": added,
        "changes": total_changes,
    }
    summary["deletions_norm"] = removed / len(start_msg)
    summary["insertions_norm"] = added / len(end_msg)
    summary["changes_norm"] = total_changes / len(end_msg)
    return summary
def get_statistics_for_df(df: pd.DataFrame):
    """Compute edit statistics for every row of ``df`` and group them by name.

    Each row must provide the ``commit_msg_start``, ``commit_msg_end`` and
    ``annotated_diff`` columns, which are forwarded to ``get_statistics``.

    Returns:
        A dict mapping each statistic name to a NumPy array holding one
        value per row, in row order.

    Raises:
        AssertionError: If ``df`` has no rows.
    """
    per_row = []
    for _, record in df.iterrows():
        row_stats = get_statistics(
            record["commit_msg_start"],
            record["commit_msg_end"],
            record["annotated_diff"],
        )
        per_row.append(row_stats)

    assert len(per_row) > 0

    # The first row's keys define which statistics are collected.
    collected = {}
    for stat_name in per_row[0]:
        collected[stat_name] = np.asarray([entry[stat_name] for entry in per_row])
    return collected
def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
    """Build a distribution plot of one statistic across the sample groups.

    Five series are plotted: the golden samples, the three synthetic groups
    (``e2s``, ``s2e``, ``e2s+s2e``) and the concatenation of those three
    synthetic groups, labelled ``Synthetic``. Histogram bars and the rug are
    switched off.

    Side effect: the plotted series are pickled to
    ``config.OUTPUT_CHARTS_DIR / "<stat_name>_data.pkl"``.

    Args:
        stat_golden: Values of the statistic for the golden samples.
        stat_e2s: Values for the ``e2s`` group.
        stat_s2e: Values for the ``s2e`` group.
        stat_e2s_s2e: Values for the ``e2s+s2e`` group.
        stat_name: Statistic name; used as the chart title and file name stem.

    Returns:
        The figure created by ``ff.create_distplot``.
    """
    synthetic_groups = (stat_e2s, stat_s2e, stat_e2s_s2e)
    pooled_synthetic = np.concatenate(synthetic_groups, axis=0)

    series = [stat_golden, *synthetic_groups, pooled_synthetic]
    labels = ['Golden', 'e2s', 's2e', 'e2s+s2e', 'Synthetic']

    figure = ff.create_distplot(series, labels, bin_size=.05, show_rug=False, show_hist=False)
    figure.update_layout(title_text=stat_name)

    # Keep the raw series next to the charts so they can be reloaded later.
    dump_path = config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"
    with open(dump_path, "wb") as f:
        pickle.dump(series, f)

    return figure
def t_test(group_stats, main_group="manual"):
    """Compare every group against the main group with an unequal-variance t-test.

    Args:
        group_stats: Mapping of group name -> mapping of statistic name ->
            sample values of that statistic.
        main_group: Key of the reference group inside ``group_stats``.

    Returns:
        Nested dict ``{group: {stat_name: p_value}}``. The main group is
        compared against itself too, so it appears in the result as well.

    Raises:
        KeyError: If ``main_group`` is missing from ``group_stats``, or the
            reference group lacks a statistic that another group has.
    """
    # Imported from the public namespace: the module-level ``stats`` name is
    # bound to the deprecated ``scipy.stats.stats`` module.
    from scipy.stats import ttest_ind

    results = {}
    for group, per_stat in group_stats.items():
        group_results = {}
        for stat, sample in per_stat.items():
            reference = group_stats[main_group][stat]
            # equal_var=False selects Welch's t-test. No random_state is
            # passed: it only matters for the permutation variant, which is
            # not requested here, so the p-value is deterministic anyway.
            group_results[stat] = ttest_ind(reference, sample, equal_var=False).pvalue
        results[group] = group_results
    return results