File size: 10,455 Bytes
5434c4b a01d3ba 5434c4b 7ab7be2 5434c4b 0c136d8 b7f7a57 fbb73cc 0c136d8 5434c4b 5f3a4af 303303b 7ab7be2 303303b f26a894 5434c4b b6ae739 5434c4b 02ebb6e ea84073 13e3243 ea84073 b6ae739 5ae823f b6ae739 5ae823f 5434c4b 642fae1 5434c4b 642fae1 0b259d2 ea84073 13e3243 0b259d2 d7e2287 13e3243 0b259d2 0c136d8 0b259d2 b6ae739 0c136d8 5f3a4af 0b259d2 5f3a4af b6ae739 5f3a4af a01d3ba f26a894 303303b f26a894 7ab7be2 f26a894 303303b f26a894 303303b f26a894 7ab7be2 303303b f26a894 303303b 7ab7be2 303303b 7ab7be2 303303b 5f3a4af a01d3ba 39950c9 ff76f88 a01d3ba b6ae739 0c136d8 5434c4b b6ae739 5f3a4af 5434c4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
import gradio as gr
import analysis_util
import generate_annotated_diffs
import dataset_statistics
# Manual dataset, loaded once when the module is imported.
df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
# Both synthesis-step flags are set to False for every manual row;
# update_dataset_view reads these two columns from whichever frame it is given.
df_manual["end_to_start"] = False
df_manual["start_to_end"] = False
n_diffs_manual = len(df_manual)  # used as the slider maximum of the "Manual" tab

# Synthetic dataset; the filter helpers below select rows by its
# 'end_to_start' / 'start_to_end' columns.
df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
n_diffs_synthetic = len(df_synthetic)  # used as the slider maximum of the "Synthetic" tab
def golden():
    """Return the manual dataset frame as-is (no filtering, no copy)."""
    manual_frame = df_manual
    return manual_frame
def e2s():
    """Return synthetic rows with 'end_to_start' True and 'start_to_end' False."""
    made_end_to_start = df_synthetic['end_to_start'] == True
    not_start_to_end = df_synthetic['start_to_end'] == False
    row_mask = made_end_to_start & not_start_to_end
    return df_synthetic[row_mask]
def s2e():
    """Return synthetic rows with 'end_to_start' False and 'start_to_end' True."""
    not_end_to_start = df_synthetic['end_to_start'] == False
    made_start_to_end = df_synthetic['start_to_end'] == True
    row_mask = not_end_to_start & made_start_to_end
    return df_synthetic[row_mask]
def e2s_s2e():
    """Return synthetic rows where both 'end_to_start' and 'start_to_end' are True."""
    made_end_to_start = df_synthetic['end_to_start'] == True
    made_start_to_end = df_synthetic['start_to_end'] == True
    row_mask = made_end_to_start & made_start_to_end
    return df_synthetic[row_mask]
def synthetic():
    """Return synthetic rows where 'end_to_start' or 'start_to_end' (or both) is True."""
    made_end_to_start = df_synthetic['end_to_start'] == True
    made_start_to_end = df_synthetic['start_to_end'] == True
    row_mask = made_end_to_start | made_start_to_end
    return df_synthetic[row_mask]
# Per-group statistics, keyed by group name. Each frame is produced lazily and
# handed straight to get_statistics_for_df, one group at a time, in this order.
STATISTICS = {
    group_name: dataset_statistics.get_statistics_for_df(provide_frame())
    for group_name, provide_frame in (
        ("manual", lambda: df_manual),
        ("e2s", e2s),
        ("s2e", s2e),
        ("e2s_s2e", e2s_s2e),
        ("synthetic", synthetic),
        ("all", lambda: df_synthetic),
    )
}

# t-test results for the groups above, with 'manual' passed as the main group.
STATISTICS_T_TEST = dataset_statistics.t_test(STATISTICS, main_group='manual')

# Statistic names, taken from the keys of the manual group's statistics.
STAT_NAMES = list(STATISTICS['manual'].keys())
def update_dataset_view(diff_idx, df):
    """Collect the values displayed for one sample of ``df``.

    Args:
        diff_idx: 1-based sample number (the slider value). Float values such
            as ``2.0`` are accepted and truncated to ``int``.
        df: Frame with the columns 'annotated_diff', 'commit_msg_start',
            'commit_msg_end', 'session', 'end_to_start', 'start_to_end',
            'repo' and 'hash'.

    Returns:
        A 7-tuple: annotated diff, start message, end message, session,
        ``str`` of the 'end_to_start' flag, ``str`` of the 'start_to_end'
        flag, and the GitHub commit URL built from 'repo' and 'hash'.
    """
    # ``iloc`` rejects float positions, so normalise to int before turning the
    # 1-based sample number into a 0-based position. The row is fetched once
    # instead of once per field.
    row = df.iloc[int(diff_idx) - 1]
    return (
        row['annotated_diff'],
        row['commit_msg_start'],
        row['commit_msg_end'],
        row['session'],
        str(row['end_to_start']),
        str(row['start_to_end']),
        f"https://github.com/{row['repo']}/commit/{row['hash']}",
    )
def update_dataset_view_manual(diff_idx):
    """Return the view values for sample number ``diff_idx`` of the manual dataset."""
    return update_dataset_view(diff_idx, df=df_manual)
def update_dataset_view_synthetic(diff_idx):
    """Return the view values for sample number ``diff_idx`` of the synthetic dataset."""
    return update_dataset_view(diff_idx, df=df_synthetic)
# JavaScript source handed to gr.Blocks(js=...). When the '__theme' query
# parameter is not 'light', it sets it to 'light' and navigates to the
# resulting URL.
force_light_theme_js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'light') {
url.searchParams.set('__theme', 'light');
window.location.href = url.href;
}
}
"""
if __name__ == '__main__':
    with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
        def dataset_view_tab(n_items):
            """Create the widgets of one dataset tab.

            Returns the sample slider and the list of output components. The
            list order matches the tuple returned by ``update_dataset_view``.
            """
            def read_only_textbox(label):
                # All text fields in the tab share the same read-only setup.
                return gr.Textbox(interactive=False, label=label, container=True)

            slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
                               label=f"Sample number (total: {n_items})")
            diff_view = gr.HighlightedText(combine_adjacent=True,
                                           color_map={'+': "green", '-': "red"})
            start_view = read_only_textbox("Start message")
            end_view = read_only_textbox("End message")
            session_view = read_only_textbox("Session")
            is_end_to_start_view = read_only_textbox(
                "Is generated on the 'end-to-start' synthesis step?")
            is_start_to_end_view = read_only_textbox(
                "Is generated on the 'start-to-end' synthesis step?")
            link_view = gr.Markdown()

            view = [
                diff_view,
                start_view,
                end_view,
                session_view,
                is_end_to_start_view,
                is_start_to_end_view,
                link_view,
            ]
            return slider, view

        with gr.Tab("Manual"):
            slider_manual, view_manual = dataset_view_tab(n_diffs_manual)
            slider_manual.change(update_dataset_view_manual, inputs=slider_manual,
                                 outputs=view_manual)

        with gr.Tab("Synthetic"):
            slider_synthetic, view_synthetic = dataset_view_tab(n_diffs_synthetic)
            slider_synthetic.change(update_dataset_view_synthetic, inputs=slider_synthetic,
                                    outputs=view_synthetic)

        with gr.Tab("Analysis"):
            def layout_for_statistics(statistics_group_name):
                """Render the sample count and the per-statistic means of one group."""
                gr.Markdown(f"### {statistics_group_name}")
                stats = STATISTICS[statistics_group_name]
                gr.Number(label="Count", interactive=False,
                          value=len(stats['deletions_norm']), min_width=0)
                averaged_fields = (
                    ("Avg deletions number (rel to the initial msg length)", 'deletions_norm'),
                    ("Avg insertions number (rel to the result length)", 'insertions_norm'),
                    ("Avg changes number (rel to the initial msg length)", 'changes_norm'),
                    ("Avg deletions number", 'deletions'),
                    ("Avg insertions number", 'insertions'),
                    ("Avg changes number", 'changes'),
                )
                for label, stat_name in averaged_fields:
                    gr.Number(label=label, interactive=False,
                              value=stats[stat_name].mean().item(), precision=3, min_width=0)

            def layout_for_statistics_t_test(statistics_group_name):
                """Render the t-test values of one group (currently not shown in the UI)."""
                gr.Markdown(f"### {statistics_group_name}")
                stats = STATISTICS_T_TEST[statistics_group_name]
                t_test_fields = (
                    ("Deletions number (rel to the initial msg length)", 'deletions_norm'),
                    ("Insertions number (rel to the result length)", 'insertions_norm'),
                    ("Changes number (rel to the initial msg length)", 'changes_norm'),
                    ("Deletions number", 'deletions'),
                    ("Insertions number", 'insertions'),
                    ("Changes number", 'changes'),
                )
                for label, stat_name in t_test_fields:
                    gr.Number(label=label, interactive=False,
                              value=stats[stat_name], precision=3, min_width=0)

            def plot_statistics(stat_names):
                """Render one chart per statistic name, comparing manual, e2s, s2e and e2s_s2e."""
                for stat_name in stat_names:
                    chart = dataset_statistics.build_plotly_chart(
                        stat_golden=STATISTICS['manual'][stat_name],
                        stat_e2s=STATISTICS['e2s'][stat_name],
                        stat_s2e=STATISTICS['s2e'][stat_name],
                        stat_e2s_s2e=STATISTICS['e2s_s2e'][stat_name],
                        stat_name=stat_name,
                    )
                    gr.Plot(value=chart)

            # One summary column per statistics group, left to right.
            with gr.Row():
                for group_name in ("manual", "e2s", "s2e", "e2s_s2e", "synthetic", "all"):
                    with gr.Column(scale=1, min_width=100):
                        layout_for_statistics(group_name)

            # The "Student t-test (p-value)" section is intentionally not rendered.
            # To show it, add a "### Student t-test (p-value)" heading and a row
            # like the one above that calls layout_for_statistics_t_test per group.

            # Absolute statistics on the left, normalised ("_norm") ones on the right.
            with gr.Row():
                with gr.Column(scale=1):
                    plot_statistics([name for name in STAT_NAMES if "_norm" not in name])
                with gr.Column(scale=1):
                    plot_statistics([name for name in STAT_NAMES if "_norm" in name])

            # NOTE(review): the correlation tables are assumed to belong to the
            # Analysis tab; confirm against the original layout.
            gr.Markdown("### Reference-only correlations")
            gr.Markdown(value=analysis_util.get_correlations_for_groups(
                df_synthetic, right_side="ind").to_markdown())
            gr.Markdown("### Aggregated correlations")
            gr.Markdown(value=analysis_util.get_correlations_for_groups(
                df_synthetic, right_side="aggr").to_markdown())

        # Fill both dataset views from the sliders' initial values on page load.
        application.load(update_dataset_view_manual, inputs=slider_manual,
                         outputs=view_manual)
        application.load(update_dataset_view_synthetic, inputs=slider_synthetic,
                         outputs=view_synthetic)

    # NOTE(review): launch() is assumed to run after the Blocks context closes,
    # as is conventional for Gradio apps; confirm.
    application.launch()
|