commit-message-editing-visualization / generate_annotated_diffs.py
Petr Tsvetkov
Create visualize app
5434c4b
raw
history blame
1.73 kB
from datetime import datetime
import diff_match_patch as dmp_module
import hf_data_loader
def group_changes(changes):
    """Group change records by timestamp, ordered chronologically.

    Each record's ``'ts'`` value is parsed with ``datetime.fromisoformat``;
    records whose timestamps parse to the same datetime share a group.
    Groups are returned in ascending timestamp order, and the records
    inside each group are ordered by their ``'p'`` value.

    Args:
        changes: Iterable of mappings, each holding an ISO-format ``'ts'``
            string and a sortable ``'p'`` value (presumably a position in
            the edited text -- confirm against whatever produces them).

    Returns:
        A list of lists containing the original record objects, one inner
        list per distinct timestamp. Empty input yields an empty list.

    Raises:
        KeyError: If a record has no ``'ts'`` or ``'p'`` entry.
        ValueError: If a ``'ts'`` string is not valid ISO format.
    """
    groups = {}
    for change in changes:
        # Key on the parsed datetime rather than the raw string so that the
        # final ordering is chronological and different spellings of the
        # same moment land in the same group.
        timestamp = datetime.fromisoformat(change['ts'])
        groups.setdefault(timestamp, []).append(change)

    # sorted() is stable, so records with equal 'p' keep their input order.
    return [
        sorted(groups[timestamp], key=lambda change: change['p'])
        for timestamp in sorted(groups)
    ]
def fill_in_annotation_gaps(annotated_text):
    """Forward-fill missing annotations in place.

    ``annotated_text`` is a sequence of mutable two-element entries whose
    second item is an annotation label or ``None``. Every ``None`` label
    that comes after at least one labelled entry is replaced with the label
    of the nearest preceding labelled entry. Entries before the first
    labelled one are left as ``None``.

    Args:
        annotated_text: Sequence of ``[text, label]`` lists. The inner
            entries are modified in place, so they must support item
            assignment (tuples will not work).

    Returns:
        The same ``annotated_text`` object that was passed in.
    """
    current_label = None
    for entry in annotated_text:
        if entry[1] is not None:
            # A labelled entry starts (or continues) a run.
            current_label = entry[1]
        elif current_label is not None:
            # Unlabelled entry after a run: inherit that run's label.
            entry[1] = current_label
    return annotated_text
def get_annotated_diff(start_text, end_text):
    """Diff two texts and return the result as annotated chunks.

    The diff is computed with diff-match-patch and passed through its
    semantic cleanup before being converted.

    Args:
        start_text: The original text.
        end_text: The edited text.

    Returns:
        A list of ``[text, tag]`` lists in diff order, where ``tag`` is
        ``'-'`` for operation code -1, ``'+'`` for code 1 and ``None`` for
        code 0.
    """
    op_to_tag = {1: '+', 0: None, -1: '-'}

    differ = dmp_module.diff_match_patch()
    chunks = differ.diff_main(start_text, end_text)
    # The cleanup call's return value is not used; `chunks` itself is read
    # afterwards, so the cleanup is expected to edit the list in place.
    differ.diff_cleanupSemantic(chunks)

    annotated = []
    for op, text in chunks:
        annotated.append([text, op_to_tag[op]])
    return annotated
def annotated_diff_for_row(row):
    """Return the annotated diff for one dataset row.

    Args:
        row: Mapping-like object providing ``'commit_msg_start'`` and
            ``'commit_msg_end'`` entries.

    Returns:
        Whatever ``get_annotated_diff`` returns for that pair of messages.
    """
    return get_annotated_diff(row['commit_msg_start'], row['commit_msg_end'])
def data_with_annotated_diffs():
    """Load the raw dataset and attach an ``'annotated_diff'`` column.

    The dataset comes from ``hf_data_loader.load_raw_dataset_as_pandas()``
    (presumably a pandas DataFrame -- confirm in that module). Each row is
    passed to ``annotated_diff_for_row`` and the results are stored under
    the ``'annotated_diff'`` key of the loaded object.

    Returns:
        The loaded dataset object with the added column.
    """
    dataset = hf_data_loader.load_raw_dataset_as_pandas()
    # axis=1 applies the function to each row rather than each column.
    dataset['annotated_diff'] = dataset.apply(annotated_diff_for_row, axis=1)
    return dataset