|
import Levenshtein |
|
import evaluate |
|
import pandas as pd |
|
from tqdm import tqdm |
|
|
|
import config |
|
from analysis_util import correlations_for_group |
|
from api_wrappers import hf_data_loader |
|
from custom_metrics import gpt_eval |
|
|
|
# BLEU metric handle from Hugging Face `evaluate`, created once at import
# time; config.CACHE_DIR is passed as the cache directory.
BLEU = evaluate.load("bleu", cache_dir=config.CACHE_DIR)
|
|
|
|
|
def bleu_fn(pred, ref, **kwargs):
    """Return the ``"bleu"`` value computed by the module-level ``BLEU`` metric.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, ``pred`` is repeated ``len(refs)``
            times and passed together with ``refs`` as the references, so
            ``ref`` is ignored. Any other keyword is not read here.

    Returns:
        The ``"bleu"`` entry of the dict returned by ``BLEU.compute``.
    """
    if "refs" not in kwargs:
        predictions, references = [pred], [ref]
    else:
        predictions = [pred] * len(kwargs["refs"])
        references = kwargs["refs"]
    return BLEU.compute(predictions=predictions, references=references)["bleu"]
|
|
|
|
|
# METEOR metric handle from Hugging Face `evaluate`, created once at import
# time; config.CACHE_DIR is passed as the cache directory.
METEOR = evaluate.load("meteor", cache_dir=config.CACHE_DIR)
|
|
|
|
|
def meteor_fn(pred, ref, **kwargs):
    """Return the ``"meteor"`` value computed by the module-level ``METEOR`` metric.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, ``pred`` is repeated ``len(refs)``
            times and passed together with ``refs`` as the references, so
            ``ref`` is ignored. Any other keyword is not read here.

    Returns:
        The ``"meteor"`` entry of the dict returned by ``METEOR.compute``.
    """
    if "refs" not in kwargs:
        predictions, references = [pred], [ref]
    else:
        predictions = [pred] * len(kwargs["refs"])
        references = kwargs["refs"]
    return METEOR.compute(predictions=predictions, references=references)["meteor"]
|
|
|
|
|
# ROUGE metric handle from Hugging Face `evaluate`, shared by the rouge1,
# rouge2 and rougeL wrappers; config.CACHE_DIR is passed as the cache directory.
ROUGE = evaluate.load("rouge", cache_dir=config.CACHE_DIR)
|
|
|
|
|
def rouge1_fn(pred, ref, **kwargs):
    """Return the ``"rouge1"`` value computed by the module-level ``ROUGE`` metric.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, ``pred`` is repeated ``len(refs)``
            times and passed together with ``refs`` as the references, so
            ``ref`` is ignored. Any other keyword is not read here.

    Returns:
        The ``"rouge1"`` entry of the dict returned by ``ROUGE.compute``.
    """
    if "refs" not in kwargs:
        predictions, references = [pred], [ref]
    else:
        predictions = [pred] * len(kwargs["refs"])
        references = kwargs["refs"]
    return ROUGE.compute(predictions=predictions, references=references)["rouge1"]
|
|
|
|
|
def rouge2_fn(pred, ref, **kwargs):
    """Return the ``"rouge2"`` value computed by the module-level ``ROUGE`` metric.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, ``pred`` is repeated ``len(refs)``
            times and passed together with ``refs`` as the references, so
            ``ref`` is ignored. Any other keyword is not read here.

    Returns:
        The ``"rouge2"`` entry of the dict returned by ``ROUGE.compute``.
    """
    if "refs" not in kwargs:
        predictions, references = [pred], [ref]
    else:
        predictions = [pred] * len(kwargs["refs"])
        references = kwargs["refs"]
    return ROUGE.compute(predictions=predictions, references=references)["rouge2"]
|
|
|
|
|
def rougeL_fn(pred, ref, **kwargs):
    """Return the ``"rougeL"`` value computed by the module-level ``ROUGE`` metric.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, ``pred`` is repeated ``len(refs)``
            times and passed together with ``refs`` as the references, so
            ``ref`` is ignored. Any other keyword is not read here.

    Returns:
        The ``"rougeL"`` entry of the dict returned by ``ROUGE.compute``.
    """
    if "refs" not in kwargs:
        predictions, references = [pred], [ref]
    else:
        predictions = [pred] * len(kwargs["refs"])
        references = kwargs["refs"]
    return ROUGE.compute(predictions=predictions, references=references)["rougeL"]
|
|
|
|
|
# BERTScore metric handle from Hugging Face `evaluate`, created once at import
# time; config.CACHE_DIR is passed as the cache directory.
BERTSCORE = evaluate.load("bertscore", cache_dir=config.CACHE_DIR)
|
|
|
|
|
def bertscore_fn(pred, ref, **kwargs):
    """Return the first ``"f1"`` value from the module-level ``BERTSCORE`` metric.

    The metric is always called with ``model_type="distilbert-base-uncased"``
    and a single prediction.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, the whole ``refs`` value is passed
            as the one reference entry for ``pred`` and ``ref`` is ignored.
            Any other keyword is not read here.

    Returns:
        ``result["f1"][0]`` of the dict returned by ``BERTSCORE.compute``.
    """
    reference_entry = kwargs["refs"] if "refs" in kwargs else ref
    result = BERTSCORE.compute(
        predictions=[pred],
        references=[reference_entry],
        model_type="distilbert-base-uncased",
    )
    return result["f1"][0]
|
|
|
|
|
# chrF metric handle from Hugging Face `evaluate`, created once at import time.
# config.CACHE_DIR is passed as the cache directory so this loader matches the
# BLEU / METEOR / ROUGE / BERTScore loaders in this module.
CHRF = evaluate.load("chrf", cache_dir=config.CACHE_DIR)
|
|
|
|
|
def chrf_fn(pred, ref, **kwargs):
    """Return the ``"score"`` value computed by the module-level ``CHRF`` metric.

    The metric is called with one prediction and one list of references.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given, in
            which case the reference list is ``[ref]``.
        **kwargs: If ``refs`` is present, it is used as the reference list
            for ``pred`` and ``ref`` is ignored. Any other keyword is not
            read here.

    Returns:
        The ``"score"`` entry of the dict returned by ``CHRF.compute``.
    """
    reference_group = kwargs["refs"] if "refs" in kwargs else [ref]
    result = CHRF.compute(predictions=[pred], references=[reference_group])
    return result["score"]
|
|
|
|
|
# TER metric handle from Hugging Face `evaluate`, created once at import time.
# config.CACHE_DIR is passed as the cache directory so this loader matches the
# BLEU / METEOR / ROUGE / BERTScore loaders in this module.
TER = evaluate.load("ter", cache_dir=config.CACHE_DIR)
|
|
|
|
|
def ter_fn(pred, ref, **kwargs):
    """Return the ``"score"`` value computed by the module-level ``TER`` metric.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, ``pred`` is scored against each
            entry separately and the arithmetic mean of those scores is
            returned; ``ref`` is ignored. Any other keyword is not read here.

    Returns:
        A single TER score, or the mean of the per-reference scores.
    """

    def score_against(single_reference):
        # One prediction paired with exactly one reference.
        return TER.compute(predictions=[pred], references=[[single_reference]])["score"]

    if "refs" not in kwargs:
        return score_against(ref)

    per_reference = [score_against(candidate) for candidate in kwargs["refs"]]
    return sum(per_reference) / len(per_reference)
|
|
|
|
|
def edit_distance_fn(pred, ref, **kwargs):
    """Return ``Levenshtein.distance`` between ``pred`` and the reference(s).

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, the distance to each entry is
            computed and the arithmetic mean is returned; ``ref`` is ignored.
            Any other keyword is not read here.

    Returns:
        The distance to ``ref``, or the mean distance over ``refs``.
    """
    if "refs" not in kwargs:
        return Levenshtein.distance(pred, ref)

    distances = [Levenshtein.distance(pred, candidate) for candidate in kwargs["refs"]]
    return sum(distances) / len(distances)
|
|
|
|
|
def edit_distance_norm_fn(pred, ref, **kwargs):
    """Return ``Levenshtein.distance`` divided by the length of ``pred``.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, the normalised distance to each
            entry is computed and the arithmetic mean is returned; ``ref``
            is ignored. Any other keyword is not read here.

    Returns:
        The normalised distance to ``ref``, or the mean normalised distance
        over ``refs``. For an empty ``pred`` the divisor is clamped to 1, so
        the raw distance is returned instead of raising ZeroDivisionError.
    """
    # An empty prediction would otherwise divide by zero and abort a whole
    # DataFrame apply; clamp the divisor so the result stays finite.
    divisor = max(len(pred), 1)

    if "refs" in kwargs:
        scores = [Levenshtein.distance(pred, candidate) / divisor for candidate in kwargs["refs"]]
        return sum(scores) / len(scores)

    return Levenshtein.distance(pred, ref) / divisor
|
|
|
|
|
def edit_time_fn(pred, ref, **kwargs):
    """Return the ``edittime`` keyword argument unchanged.

    ``pred`` and ``ref`` are accepted only so the signature matches the other
    metric functions; they are not read.

    Raises:
        KeyError: If no ``edittime`` keyword was supplied.
    """
    edit_time = kwargs["edittime"]
    return edit_time
|
|
|
|
|
def gptscore_ref_1_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_ref`` for ``pred`` with ``n_requests=1``.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, each entry is scored separately and
            the arithmetic mean is returned; ``ref`` is ignored. Any other
            keyword is not read here.

    Returns:
        The score against ``ref``, or the mean score over ``refs``.
    """

    def score_against(single_reference):
        return gpt_eval.compute_ref(prediction=pred, reference=single_reference, n_requests=1)

    if "refs" not in kwargs:
        return score_against(ref)

    per_reference = [score_against(candidate) for candidate in kwargs["refs"]]
    return sum(per_reference) / len(per_reference)
|
|
|
|
|
def gptscore_ref_3_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_ref`` for ``pred`` with ``n_requests=3``.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, each entry is scored separately and
            the arithmetic mean is returned; ``ref`` is ignored. Any other
            keyword is not read here.

    Returns:
        The score against ``ref``, or the mean score over ``refs``.
    """

    def score_against(single_reference):
        return gpt_eval.compute_ref(prediction=pred, reference=single_reference, n_requests=3)

    if "refs" not in kwargs:
        return score_against(ref)

    per_reference = [score_against(candidate) for candidate in kwargs["refs"]]
    return sum(per_reference) / len(per_reference)
|
|
|
|
|
def gptscore_ref_5_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_ref`` for ``pred`` with ``n_requests=5``.

    Args:
        pred: Predicted text.
        ref: Reference text; only used when no ``refs`` keyword is given.
        **kwargs: If ``refs`` is present, each entry is scored separately and
            the arithmetic mean is returned; ``ref`` is ignored. Any other
            keyword is not read here.

    Returns:
        The score against ``ref``, or the mean score over ``refs``.
    """

    def score_against(single_reference):
        return gpt_eval.compute_ref(prediction=pred, reference=single_reference, n_requests=5)

    if "refs" not in kwargs:
        return score_against(ref)

    per_reference = [score_against(candidate) for candidate in kwargs["refs"]]
    return sum(per_reference) / len(per_reference)
|
|
|
|
|
def gptscore_noref_1_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_noref`` for ``pred`` with ``n_requests=1``.

    ``ref`` is not read; the required ``diff`` keyword is forwarded as the
    ``diff`` argument.

    Raises:
        KeyError: If no ``diff`` keyword was supplied.
    """
    diff = kwargs['diff']
    return gpt_eval.compute_noref(prediction=pred, diff=diff, n_requests=1)
|
|
|
|
|
def gptscore_noref_3_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_noref`` for ``pred`` with ``n_requests=3``.

    ``ref`` is not read; the required ``diff`` keyword is forwarded as the
    ``diff`` argument.

    Raises:
        KeyError: If no ``diff`` keyword was supplied.
    """
    diff = kwargs['diff']
    return gpt_eval.compute_noref(prediction=pred, diff=diff, n_requests=3)
|
|
|
|
|
def gptscore_noref_5_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_noref`` for ``pred`` with ``n_requests=5``.

    ``ref`` is not read; the required ``diff`` keyword is forwarded as the
    ``diff`` argument.

    Raises:
        KeyError: If no ``diff`` keyword was supplied.
    """
    diff = kwargs['diff']
    return gpt_eval.compute_noref(prediction=pred, diff=diff, n_requests=5)
|
|
|
|
|
# Metrics applied to (commit_msg_start, reference) pairs in compute_metrics;
# each key becomes the prefix of a "<key>_independent" column.
IND_METRICS = {
    # Character-level edit distances.
    "editdist": edit_distance_fn,
    "editdist-norm": edit_distance_norm_fn,
    # Metrics backed by Hugging Face `evaluate`.
    "bleu": bleu_fn,
    "meteor": meteor_fn,
    "rouge1": rouge1_fn,
    "rouge2": rouge2_fn,
    "rougeL": rougeL_fn,
    "bertscore": bertscore_fn,
    "chrF": chrf_fn,
    "ter": ter_fn,
}
|
|
|
# Metrics scored against several references at once (passed via the `refs`
# keyword in compute_metrics); currently none are registered.
AGGR_METRICS = {}
|
|
|
|
|
|
|
|
|
# Metrics applied to (commit_msg_start, commit_msg_end) pairs in
# compute_metrics; each key becomes the prefix of a "<key>_related" column.
REL_METRICS = {
    "editdist": edit_distance_fn,
    "editdist-norm": edit_distance_norm_fn,
    "edittime": edit_time_fn,
}
|
|
|
|
|
def attach_references(df):
    """Left-join the ``reference`` column onto ``df`` by ``(hash, repo)``.

    The reference table comes from
    ``hf_data_loader.load_full_commit_as_pandas()``; only its ``reference``
    column is kept. The joined frame is returned with ``hash`` and ``repo``
    turned back into ordinary columns via ``reset_index()``.
    """
    key_columns = ["hash", "repo"]

    full_commits = hf_data_loader.load_full_commit_as_pandas()
    reference_df = full_commits.set_index(key_columns)[["reference"]]

    keyed_df = df.set_index(key_columns)
    joined = keyed_df.join(other=reference_df, how="left")
    return joined.reset_index()
|
|
|
|
|
def compute_metrics(df):
    """Add metric and correlation columns to ``df`` in place and return it.

    Columns read: ``hash``, ``repo``, ``commit_msg_start``, ``commit_msg_end``,
    ``reference``, ``edit_time`` and ``mods``.

    Columns written:
        * ``<metric>_aggr`` for every entry of ``AGGR_METRICS``: scores
          ``commit_msg_start`` against a de-duplicated list made of the
          ``commit_msg_end`` values of other rows with the same ``hash`` and
          ``repo`` (and different start and end messages) plus the row's own
          ``reference``.
        * ``<metric>_related`` for every entry of ``REL_METRICS``: scores
          ``commit_msg_start`` against ``commit_msg_end``.
        * ``<metric>_independent`` for every entry of ``IND_METRICS``: scores
          ``commit_msg_start`` against ``reference``.
        * ``rel_<r>_ind_<i>_{pearson,spearman}`` and
          ``rel_<r>_aggr_<a>_{pearson,spearman}``: one correlation value per
          metric pair, stored as the same scalar in every row.

    Every metric function also receives ``edittime=row['edit_time']`` and
    ``diff=str(row['mods'])`` as keyword arguments.
    """
    tqdm.pandas()  # makes DataFrame.progress_apply available

    def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))

    def score_rows(fn, col_ref):
        # Row-wise scoring of commit_msg_start against the given reference column.
        return df.progress_apply(
            lambda row: apply_metric_fn_to_row(row=row,
                                               fn=fn,
                                               col_pred="commit_msg_start",
                                               col_ref=col_ref),
            axis=1
        )

    for metric, metric_fn in AGGR_METRICS.items():
        print(f"Computing {metric} for the aggregated independent pairs")
        values = []
        for _, row in tqdm(df.iterrows(), total=len(df)):
            same_commit = (df["hash"] == row["hash"]) & (df["repo"] == row["repo"])
            different_pair = (df["commit_msg_start"] != row["commit_msg_start"]) & (
                df["commit_msg_end"] != row["commit_msg_end"])
            others = df[same_commit & different_pair]['commit_msg_end'].to_list()
            others.append(row["reference"])
            # De-duplicate while keeping first-seen order so that repeated
            # runs feed the references to the metric in the same order
            # (set() ordering of strings varies between interpreter runs).
            others = list(dict.fromkeys(others))
            values.append(
                metric_fn(
                    row['commit_msg_start'], None, refs=others, edittime=row['edit_time'], diff=str(row['mods'])
                )
            )
        df[f"{metric}_aggr"] = values

    for metric, metric_fn in REL_METRICS.items():
        print(f"Computing {metric} for the related pairs")
        df[f"{metric}_related"] = score_rows(metric_fn, "commit_msg_end")

    for metric, metric_fn in IND_METRICS.items():
        print(f"Computing {metric} for the independent pairs")
        df[f"{metric}_independent"] = score_rows(metric_fn, "reference")

    for rel_metric in REL_METRICS:
        for ind_metric in IND_METRICS:
            df[f"rel_{rel_metric}_ind_{ind_metric}_pearson"] = (
                df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="pearson"))

            df[f"rel_{rel_metric}_ind_{ind_metric}_spearman"] = (
                df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="spearman"))

        for aggr_metric in AGGR_METRICS:
            df[f"rel_{rel_metric}_aggr_{aggr_metric}_pearson"] = (
                df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="pearson"))

            df[f"rel_{rel_metric}_aggr_{aggr_metric}_spearman"] = (
                df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="spearman"))

    return df
|
|
|
|
|
def compute_correlations(df: pd.DataFrame):
    """Apply ``correlations_for_group`` to each ``(end_to_start, start_to_end)`` group.

    The grouping columns are excluded from the frames handed to
    ``correlations_for_group`` (``include_groups=False``). Returns whatever
    ``GroupBy.apply`` assembles from the per-group results.
    """
    group_keys = ["end_to_start", "start_to_end"]
    return df.groupby(by=group_keys).apply(correlations_for_group, include_groups=False)
|
|
|
|
|
def transform(df):
    """Run the metric pipeline on ``df`` and write both CSV artifacts.

    Steps, in order: attach references, compute metrics, compute per-group
    correlations and write them to ``config.METRICS_CORRELATIONS_ARTIFACT``,
    then write the enriched frame to ``config.SYNTHETIC_DATASET_ARTIFACT``.

    Returns:
        The frame returned by ``compute_metrics``.
    """
    print("Computing metrics")

    with_references = attach_references(df)
    df = compute_metrics(with_references)

    compute_correlations(df).to_csv(config.METRICS_CORRELATIONS_ARTIFACT)
    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)

    print("Done")
    return df
|
|
|
|
|
def main():
    """Read ``config.START_TO_END_ARTIFACT`` (first column as index) and run ``transform``."""
    start_to_end = pd.read_csv(config.START_TO_END_ARTIFACT, index_col=[0])
    transform(start_to_end)
|
|
|
|
|
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
|
|