Petr Tsvetkov committed
Commit: e027012
Parent(s): f1b08a8

Add checkpoints
config.py CHANGED
@@ -24,4 +24,8 @@ CACHE_DIR.mkdir(exist_ok=True)
 OUTPUT_DIR = Path("output")
 OUTPUT_DIR.mkdir(exist_ok=True)

+
+END_TO_START_ARTIFACT = OUTPUT_DIR / "end_to_start.csv"
+START_TO_END_ARTIFACT = OUTPUT_DIR / "start_to_end.csv"
 SYNTHETIC_DATASET_ARTIFACT = OUTPUT_DIR / "synthetic.csv"
+METRICS_CORRELATIONS_ARTIFACT = OUTPUT_DIR / "metrics_correlations.csv"
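Taken together with the script changes below, these paths form a checkpoint chain: the end_to_start step writes end_to_start.csv, the start_to_end step consumes it and writes start_to_end.csv, and metrics_analysis consumes that and writes synthetic.csv plus metrics_correlations.csv. A minimal driver sketch of that order (the run_pipeline wrapper is hypothetical; the main() functions are the ones changed in this commit):

# Hypothetical driver: runs the three changed steps in checkpoint order.
# Each step reads the previous stage's CSV from config and writes its own.
from generation_steps import synthetic_end_to_start, synthetic_start_to_end, metrics_analysis


def run_pipeline():
    synthetic_end_to_start.main()   # HF dataset        -> output/end_to_start.csv
    synthetic_start_to_end.main()   # end_to_start.csv  -> output/start_to_end.csv
    metrics_analysis.main()         # start_to_end.csv  -> output/synthetic.csv
                                    #                    + output/metrics_correlations.csv


if __name__ == "__main__":
    run_pipeline()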
custom_metrics/__init__.py ADDED
File without changes
custom_metrics/gpt_eval.py ADDED
@@ -0,0 +1,46 @@
+import time
+
+from api_wrappers import grazie_wrapper
+
+
+def build_prompt(prediction, reference):
+    return f"""Your task is to rate the quality of the generated commit message using the scale from 1 to 5.
+
+A good commit message has to be concise.
+Assign lower scores for the commit messages that are too verbose for a commit message.
+
+The generated commit message you have to evaluate:
+START OF THE GENERATED COMMIT MESSAGE
+{prediction}
+END OF THE GENERATED COMMIT MESSAGE
+
+Here is an example of an ideal reference commit message for the same commit:
+START OF THE REFERENCE COMMIT MESSAGE
+{reference}
+END OF THE REFERENCE COMMIT MESSAGE
+
+All the information in the reference commit message is true.
+
+Print only one integer number after the token "OUTPUT" - the rating of the generated commit message.
+Do not print anything that is not an integer.
+
+OUTPUT
+"""
+
+
+N_RETRIES = 3
+
+
+def compute(prediction, reference):
+    prompt = build_prompt(prediction, reference)
+    outputs = []
+
+    for i in range(N_RETRIES):
+        try:
+            output = grazie_wrapper.generate_for_prompt(prompt).strip()[-1]
+            outputs.append(output)
+            return int(output)
+        except ValueError:
+            continue
+
+    raise RuntimeError(f"GPT4 cannot generate a number. Its outputs were: {str(outputs)}")
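A quick way to see what compute() does with a model reply: it keeps only the last character of the stripped completion and retries up to N_RETRIES times until that character parses as an int. A minimal sketch with the Grazie call stubbed out (the canned completion and the test strings are made up; it assumes the repository's packages are importable):

from unittest import mock

from custom_metrics import gpt_eval

# Made-up completion: compute() looks only at the last character after strip().
fake_completion = "The generated message is concise and faithful.\nOUTPUT 4"

with mock.patch.object(gpt_eval.grazie_wrapper, "generate_for_prompt",
                       return_value=fake_completion):
    score = gpt_eval.compute(
        prediction="Fix config path handling",
        reference="Fix crash when the output directory does not exist",
    )

print(score)  # 4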
generation_steps/metrics_analysis.py CHANGED
@@ -1,9 +1,13 @@
+import functools
+import operator
+
 import evaluate
 import pandas as pd
 from tqdm import tqdm

 import config
 from api_wrappers import hf_data_loader
+from custom_metrics import gpt_eval

 BLEU = evaluate.load('bleu', cache_dir=config.CACHE_DIR)

@@ -37,12 +41,17 @@ def bertscore_fn(pred, ref):
     return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]


+def gptscore_fn(pred, ref):
+    return gpt_eval.compute(prediction=pred, reference=ref)
+
+
 METRICS = {
+    "gptscore": gptscore_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,
     "rouge1": rouge1_fn,
     "rouge2": rouge2_fn,
-    "bertscore": bertscore_fn
+    "bertscore": bertscore_fn,
 }


@@ -82,20 +91,40 @@ def compute_metrics(df):
     return df


+def correlations_for_group(group):
+    correlations = []
+    for metric in METRICS:
+        correlations.append({
+            f"{metric}_pearson": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="pearson"),
+            f"{metric}_spearman": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="spearman")
+        })
+    return pd.Series(functools.reduce(operator.ior, correlations, {}))
+
+
+def compute_correlations(df: pd.DataFrame):
+    grouped_df = df.groupby(by=["end_to_start", "start_to_end"])
+    correlations = grouped_df.apply(correlations_for_group, include_groups=False)
+    return correlations
+
+
 def transform(df):
     print("Computing metrics")

     df = attach_references(df)
     df = compute_metrics(df)

+    correlations_for_groups = compute_correlations(df)
+    correlations_for_groups.to_csv(config.METRICS_CORRELATIONS_ARTIFACT)
+
+    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+
     print("Done")
     return df


 def main():
-    df = pd.read_csv(config.
-
-    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+    df = pd.read_csv(config.START_TO_END_ARTIFACT, index_col=[0])
+    transform(df)


 if __name__ == '__main__':
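For intuition about the new correlation step: every metric is expected to appear twice per row, as <metric>_related and <metric>_independent columns, and correlations_for_group folds the per-metric dictionaries into one pd.Series via functools.reduce(operator.ior, ...). A toy run on fabricated data (the column values are made up and only "bleu" is shown):

import functools
import operator

import pandas as pd

# Fabricated frame: two groups, one metric ("bleu") measured twice per row.
df = pd.DataFrame({
    "end_to_start": [True, True, True, False, False, False],
    "start_to_end": [False, False, False, True, True, True],
    "bleu_related": [0.9, 0.7, 0.8, 0.2, 0.4, 0.3],
    "bleu_independent": [0.8, 0.6, 0.9, 0.3, 0.5, 0.2],
})


def correlations_for_group(group):
    # Same shape as the function added in the diff, restricted to "bleu".
    correlations = [{
        "bleu_pearson": group["bleu_related"].corr(group["bleu_independent"], method="pearson"),
        "bleu_spearman": group["bleu_related"].corr(group["bleu_independent"], method="spearman"),
    }]
    # operator.ior merges the per-metric dicts into one dict (dict |= dict).
    return pd.Series(functools.reduce(operator.ior, correlations, {}))


print(df.groupby(["end_to_start", "start_to_end"]).apply(correlations_for_group, include_groups=False))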
generation_steps/synthetic_end_to_start.py CHANGED
@@ -4,7 +4,7 @@ from tqdm import tqdm
 import config
 import generate_annotated_diffs
 import statistics
-from api_wrappers import grazie_wrapper
+from api_wrappers import grazie_wrapper, hf_data_loader
 from generation_steps import examples

 GENERATION_MULTIPLIER = 3
@@ -91,15 +91,15 @@ def transform(df):
     generated_df['end_to_start'] = True

     result = pd.concat([df, generated_df], ignore_index=True)
+    result.to_csv(config.END_TO_START_ARTIFACT)

     print("Done")
     return result


 def main():
-    df =
-
-    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+    df = hf_data_loader.load_processed_rewriting_dataset_as_pandas()
+    transform(df)


 if __name__ == '__main__':
generation_steps/synthetic_start_to_end.py CHANGED
@@ -91,15 +91,15 @@ def transform(df):
     generated_df['start_to_end'] = True

     result = pd.concat([df, generated_df], ignore_index=True)
+    result.to_csv(config.START_TO_END_ARTIFACT)

     print("Done")
     return result


 def main():
-    df = pd.read_csv(config.
-
-    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+    df = pd.read_csv(config.END_TO_START_ARTIFACT, index_col=[0])
+    transform(df)


 if __name__ == '__main__':
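Both synthetic steps reload the previous checkpoint with index_col=[0]; that is what keeps the to_csv/read_csv round trip from accumulating an extra unnamed index column between stages. A small self-contained illustration of the pattern (the toy frame and file name are hypothetical):

import pandas as pd

# Toy checkpoint round trip, same read/write pattern as the main() functions above.
df = pd.DataFrame({"message": ["Fix typo", "Add tests"], "end_to_start": [False, False]})
df.to_csv("checkpoint.csv")  # the index is written as the first, unnamed column

reloaded = pd.read_csv("checkpoint.csv", index_col=[0])
assert list(reloaded.columns) == ["message", "end_to_start"]  # no stray "Unnamed: 0" column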