Petr Tsvetkov committed
Commit: e027012
Parent(s): f1b08a8

Add checkpoints
config.py CHANGED
@@ -24,4 +24,8 @@ CACHE_DIR.mkdir(exist_ok=True)
 OUTPUT_DIR = Path("output")
 OUTPUT_DIR.mkdir(exist_ok=True)

+
+END_TO_START_ARTIFACT = OUTPUT_DIR / "end_to_start.csv"
+START_TO_END_ARTIFACT = OUTPUT_DIR / "start_to_end.csv"
 SYNTHETIC_DATASET_ARTIFACT = OUTPUT_DIR / "synthetic.csv"
+METRICS_CORRELATIONS_ARTIFACT = OUTPUT_DIR / "metrics_correlations.csv"
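Taken together with the script changes below, these paths form a checkpoint chain: the end_to_start step writes end_to_start.csv, the start_to_end step consumes it and writes start_to_end.csv, and metrics_analysis consumes that and writes synthetic.csv plus metrics_correlations.csv. A minimal driver sketch of that order (the run_pipeline wrapper is hypothetical; the main() functions are the ones changed in this commit):

# Hypothetical driver: runs the three changed steps in checkpoint order.
# Each step reads the previous stage's CSV from config and writes its own.
from generation_steps import synthetic_end_to_start, synthetic_start_to_end, metrics_analysis


def run_pipeline():
    synthetic_end_to_start.main()   # HF dataset        -> output/end_to_start.csv
    synthetic_start_to_end.main()   # end_to_start.csv  -> output/start_to_end.csv
    metrics_analysis.main()         # start_to_end.csv  -> output/synthetic.csv
                                    #                    + output/metrics_correlations.csv


if __name__ == "__main__":
    run_pipeline()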
custom_metrics/__init__.py ADDED
File without changes
custom_metrics/gpt_eval.py ADDED
@@ -0,0 +1,46 @@
+import time
+
+from api_wrappers import grazie_wrapper
+
+
+def build_prompt(prediction, reference):
+    return f"""Your task is to rate the quality of the generated commit message using the scale from 1 to 5.
+
+A good commit message has to be concise.
+Assign lower scores for the commit messages that are too verbose for a commit message.
+
+The generated commit message you have to evaluate:
+START OF THE GENERATED COMMIT MESSAGE
+{prediction}
+END OF THE GENERATED COMMIT MESSAGE
+
+Here is an example of an ideal reference commit message for the same commit:
+START OF THE REFERENCE COMMIT MESSAGE
+{reference}
+END OF THE REFERENCE COMMIT MESSAGE
+
+All the information in the reference commit message is true.
+
+Print only one integer number after the token "OUTPUT" - the rating of the generated commit message.
+Do not print anything that is not an integer.
+
+OUTPUT
+"""
+
+
+N_RETRIES = 3
+
+
+def compute(prediction, reference):
+    prompt = build_prompt(prediction, reference)
+    outputs = []
+
+    for i in range(N_RETRIES):
+        try:
+            output = grazie_wrapper.generate_for_prompt(prompt).strip()[-1]
+            outputs.append(output)
+            return int(output)
+        except ValueError:
+            continue
+
+    raise RuntimeError(f"GPT4 cannot generate a number. Its outputs were: {str(outputs)}")
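A quick way to see what compute() does with a model reply: it keeps only the last character of the stripped completion and retries up to N_RETRIES times until that character parses as an int. A minimal sketch with the Grazie call stubbed out (the canned completion and the test strings are made up; it assumes the repository's packages are importable):

from unittest import mock

from custom_metrics import gpt_eval

# Made-up completion: compute() looks only at the last character after strip().
fake_completion = "The generated message is concise and faithful.\nOUTPUT 4"

with mock.patch.object(gpt_eval.grazie_wrapper, "generate_for_prompt",
                       return_value=fake_completion):
    score = gpt_eval.compute(
        prediction="Fix config path handling",
        reference="Fix crash when the output directory does not exist",
    )

print(score)  # 4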
generation_steps/metrics_analysis.py CHANGED
@@ -1,9 +1,13 @@
+import functools
+import operator
+
 import evaluate
 import pandas as pd
 from tqdm import tqdm

 import config
 from api_wrappers import hf_data_loader
+from custom_metrics import gpt_eval

 BLEU = evaluate.load('bleu', cache_dir=config.CACHE_DIR)

@@ -37,12 +41,17 @@ def bertscore_fn(pred, ref):
     return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]


+def gptscore_fn(pred, ref):
+    return gpt_eval.compute(prediction=pred, reference=ref)
+
+
 METRICS = {
+    "gptscore": gptscore_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,
     "rouge1": rouge1_fn,
     "rouge2": rouge2_fn,
-    "bertscore": bertscore_fn
+    "bertscore": bertscore_fn,
 }


@@ -82,20 +91,40 @@ def compute_metrics(df):
     return df


+def correlations_for_group(group):
+    correlations = []
+    for metric in METRICS:
+        correlations.append({
+            f"{metric}_pearson": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="pearson"),
+            f"{metric}_spearman": group[f"{metric}_related"].corr(group[f"{metric}_independent"], method="spearman")
+        })
+    return pd.Series(functools.reduce(operator.ior, correlations, {}))
+
+
+def compute_correlations(df: pd.DataFrame):
+    grouped_df = df.groupby(by=["end_to_start", "start_to_end"])
+    correlations = grouped_df.apply(correlations_for_group, include_groups=False)
+    return correlations
+
+
 def transform(df):
     print("Computing metrics")

     df = attach_references(df)
     df = compute_metrics(df)

+    correlations_for_groups = compute_correlations(df)
+    correlations_for_groups.to_csv(config.METRICS_CORRELATIONS_ARTIFACT)
+
+    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+
     print("Done")
     return df


 def main():
-    df = pd.read_csv(config.
-
-    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+    df = pd.read_csv(config.START_TO_END_ARTIFACT, index_col=[0])
+    transform(df)


 if __name__ == '__main__':
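For intuition about the new correlation step: every metric is expected to appear twice per row, as <metric>_related and <metric>_independent columns, and correlations_for_group folds the per-metric dictionaries into one pd.Series via functools.reduce(operator.ior, ...). A toy run on fabricated data (the column values are made up and only "bleu" is shown):

import functools
import operator

import pandas as pd

# Fabricated frame: two groups, one metric ("bleu") measured twice per row.
df = pd.DataFrame({
    "end_to_start": [True, True, True, False, False, False],
    "start_to_end": [False, False, False, True, True, True],
    "bleu_related": [0.9, 0.7, 0.8, 0.2, 0.4, 0.3],
    "bleu_independent": [0.8, 0.6, 0.9, 0.3, 0.5, 0.2],
})


def correlations_for_group(group):
    # Same shape as the function added in the diff, restricted to "bleu".
    correlations = [{
        "bleu_pearson": group["bleu_related"].corr(group["bleu_independent"], method="pearson"),
        "bleu_spearman": group["bleu_related"].corr(group["bleu_independent"], method="spearman"),
    }]
    # operator.ior merges the per-metric dicts into one dict (dict |= dict).
    return pd.Series(functools.reduce(operator.ior, correlations, {}))


print(df.groupby(["end_to_start", "start_to_end"]).apply(correlations_for_group, include_groups=False))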
generation_steps/synthetic_end_to_start.py CHANGED
@@ -4,7 +4,7 @@ from tqdm import tqdm
 import config
 import generate_annotated_diffs
 import statistics
-from api_wrappers import grazie_wrapper
+from api_wrappers import grazie_wrapper, hf_data_loader
 from generation_steps import examples

 GENERATION_MULTIPLIER = 3
@@ -91,15 +91,15 @@ def transform(df):
     generated_df['end_to_start'] = True

     result = pd.concat([df, generated_df], ignore_index=True)
+    result.to_csv(config.END_TO_START_ARTIFACT)

     print("Done")
     return result


 def main():
-    df =
-
-    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+    df = hf_data_loader.load_processed_rewriting_dataset_as_pandas()
+    transform(df)


 if __name__ == '__main__':
generation_steps/synthetic_start_to_end.py CHANGED
@@ -91,15 +91,15 @@ def transform(df):
     generated_df['start_to_end'] = True

     result = pd.concat([df, generated_df], ignore_index=True)
+    result.to_csv(config.START_TO_END_ARTIFACT)

     print("Done")
     return result


 def main():
-    df = pd.read_csv(config.
-
-    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
+    df = pd.read_csv(config.END_TO_START_ARTIFACT, index_col=[0])
+    transform(df)


 if __name__ == '__main__':
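Both synthetic steps reload the previous checkpoint with index_col=[0]; that is what keeps the to_csv/read_csv round trip from accumulating an extra unnamed index column between stages. A small self-contained illustration of the pattern (the toy frame and file name are hypothetical):

import pandas as pd

# Toy checkpoint round trip, same read/write pattern as the main() functions above.
df = pd.DataFrame({"message": ["Fix typo", "Add tests"], "end_to_start": [False, False]})
df.to_csv("checkpoint.csv")  # the index is written as the first, unnamed column

reloaded = pd.read_csv("checkpoint.csv", index_col=[0])
assert list(reloaded.columns) == ["message", "end_to_start"]  # no stray "Unnamed: 0" column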