|
from datetime import datetime |
|
|
|
import diff_match_patch as dmp_module |
|
|
|
import hf_data_loader |
|
|
|
|
|
def group_changes(changes):
    """Group change records by timestamp and order them chronologically.

    Each record's ``'ts'`` value is parsed with ``datetime.fromisoformat``;
    records that parse to the same datetime end up in the same group.

    Args:
        changes: Iterable of mappings, each providing an ISO-format
            timestamp string under ``'ts'`` and a sortable value under
            ``'p'``.

    Returns:
        A list of groups (lists of the original records). Groups are ordered
        by ascending timestamp, and the records inside each group are
        ordered by ascending ``'p'`` (stable for equal ``'p'`` values).

    Raises:
        KeyError: If a record lacks ``'ts'`` or ``'p'``.
        ValueError: If a ``'ts'`` value is not a valid ISO-format string.
    """
    groups = {}
    for change in changes:
        timestamp = datetime.fromisoformat(change['ts'])
        # setdefault creates the bucket on first sight of a timestamp.
        groups.setdefault(timestamp, []).append(change)

    return [
        sorted(groups[timestamp], key=lambda change: change['p'])
        for timestamp in sorted(groups)
    ]
|
|
|
|
|
def fill_in_annotation_gaps(annotated_text):
    """Propagate each annotation forward over the unannotated entries after it.

    Entries are read as ``[text, label]`` pairs (only index 1 is inspected).
    Starting at the first entry whose label is not ``None``, every entry is
    assigned the most recent non-``None`` label seen at or before it. Entries
    that precede the first labelled one are left as ``None``.

    Args:
        annotated_text: Sequence of mutable two-item entries; modified in
            place.

    Returns:
        The same ``annotated_text`` object that was passed in.
    """
    active_label = None

    for entry in annotated_text:
        if entry[1] is not None:
            active_label = entry[1]
        if active_label is not None:
            # Written even when the entry already carries this label, so
            # every entry from the first labelled one onward is assigned.
            entry[1] = active_label

    return annotated_text
|
|
|
|
|
def get_annotated_diff(start_text, end_text):
    """Diff two texts and label each resulting fragment.

    Runs ``diff_main`` followed by ``diff_cleanupSemantic`` from the
    ``diff_match_patch`` module, then converts every ``(op, text)`` entry of
    the diff into a ``[text, label]`` pair.

    Args:
        start_text: The text to diff from.
        end_text: The text to diff to.

    Returns:
        A list of ``[text, label]`` lists where ``label`` is ``'-'`` for
        op ``-1``, ``'+'`` for op ``1`` and ``None`` for op ``0``.
    """
    op_labels = {-1: '-', 0: None, 1: '+'}

    differ = dmp_module.diff_match_patch()
    edits = differ.diff_main(start_text, end_text)
    # Cleans up the edit list in place (its return value is not used).
    differ.diff_cleanupSemantic(edits)

    return [[fragment, op_labels[op]] for op, fragment in edits]
|
|
|
|
|
def annotated_diff_for_row(row):
    """Return the annotated diff between a row's two commit messages.

    Args:
        row: Mapping-like object exposing ``'commit_msg_start'`` and
            ``'commit_msg_end'``.

    Returns:
        Whatever ``get_annotated_diff`` returns for those two values.
    """
    return get_annotated_diff(row['commit_msg_start'], row['commit_msg_end'])
|
|
|
|
|
def data_with_annotated_diffs():
    """Load the raw dataset and attach an ``'annotated_diff'`` column.

    The dataset comes from ``hf_data_loader.load_raw_dataset_as_pandas()``
    (presumably a pandas DataFrame -- confirm against the loader). Each row
    is passed to ``annotated_diff_for_row`` and the results are stored under
    ``'annotated_diff'``.

    Returns:
        The loaded dataset object with the extra column added.
    """
    dataset = hf_data_loader.load_raw_dataset_as_pandas()
    # axis=1 hands each row (not each column) to the callback.
    dataset['annotated_diff'] = dataset.apply(annotated_diff_for_row, axis=1)
    return dataset
|
|