|
import gradio as gr |
|
import pandas as pd |
|
|
|
from dataset import get_dataframe |
|
from markdown import GUIDELINES, PANEL_MARKDOWN |
|
|
|
|
|
# Load the full table once at import time. The filter_dataframe_corpus and
# filter_dataframe_model helpers below read this module-level frame and split
# it on its "Model or corpus" column.
df = get_dataframe()
|
|
|
|
|
def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes): |
|
""" |
|
Filter the dataframe based on the provided evaluation dataset, contaminated source, and checkboxes. |
|
|
|
Args: |
|
dataframe (pandas.DataFrame): The input dataframe to filter. |
|
eval_dataset (str): The evaluation dataset to filter by. |
|
cont_source (str): The contaminated source to filter by. |
|
checkboxes (list): The checkboxes to filter by. |
|
|
|
Returns: |
|
pandas.DataFrame: The filtered dataframe. |
|
""" |
|
if isinstance(eval_dataset, str): |
|
dataframe = dataframe[ |
|
dataframe["Evaluation Dataset"].str.contains(eval_dataset) |
|
] |
|
if isinstance(cont_source, str): |
|
dataframe = dataframe[ |
|
dataframe["Contaminated Source"].str.contains(cont_source) |
|
] |
|
if isinstance(checkboxes, list) and "Exclude model-based evidences" in checkboxes: |
|
dataframe = dataframe[dataframe["Approach"] != "model-based"] |
|
if isinstance(checkboxes, list) and "Show only contaminated" in checkboxes: |
|
dataframe = dataframe[ |
|
(dataframe["Train Split"] > 0.0) |
|
| (dataframe["Development Split"] > 0.0) |
|
| (dataframe["Test Split"] > 0.0) |
|
] |
|
|
|
return dataframe.style.format( |
|
{"Train Split": "{:.1%}", "Development Split": "{:.1%}", "Test Split": "{:.1%}"} |
|
) |
|
|
|
|
|
def filter_dataframe_corpus(*args, **kwargs) -> pd.DataFrame:
    """
    Apply the shared filters to the corpus-contamination rows only.

    Takes the rows of the module-level ``df`` whose "Model or corpus" column
    equals "corpus", removes that column, and forwards the result together
    with every positional and keyword argument to ``filter_dataframe``.

    Returns:
        The value produced by ``filter_dataframe`` for the corpus subset.
        Note that ``filter_dataframe`` returns the result of
        ``DataFrame.style.format(...)``, i.e. a styled frame, even though
        this function is annotated as returning ``pd.DataFrame``.
    """
    is_corpus_row = df["Model or corpus"] == "corpus"
    corpus_rows = df.loc[is_corpus_row].drop(columns=["Model or corpus"])
    return filter_dataframe(corpus_rows, *args, **kwargs)
|
|
|
|
|
def filter_dataframe_model(*args, **kwargs) -> pd.DataFrame:
    """
    Apply the shared filters to the model-contamination rows only.

    Takes the rows of the module-level ``df`` whose "Model or corpus" column
    equals "model", removes that column, and forwards the result together
    with every positional and keyword argument to ``filter_dataframe``.

    Returns:
        The value produced by ``filter_dataframe`` for the model subset.
        Note that ``filter_dataframe`` returns the result of
        ``DataFrame.style.format(...)``, i.e. a styled frame, even though
        this function is annotated as returning ``pd.DataFrame``.
    """
    is_model_row = df["Model or corpus"] == "model"
    model_rows = df.loc[is_model_row].drop(columns=["Model or corpus"])
    return filter_dataframe(model_rows, *args, **kwargs)
|
|
|
|
|
# Visual theme for the app: Gradio's Soft theme with custom hues, compact
# text/spacing, and light/dark block backgrounds.
theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="red",
    text_size="sm",
    spacing_size="sm",
    # The font list is a fallback chain. Listing Poppins once followed by
    # generic families gives the browser a sans-serif fallback if the Google
    # font cannot be loaded; repeating the same web font adds no fallback.
    font=[
        gr.themes.GoogleFont("Poppins"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
).set(block_background_fill="*neutral_50", block_background_fill_dark="*neutral_950")
|
|
|
|
|
# Assemble the UI: two filterable tables (corpus / model contamination) and a
# static guidelines tab. The nesting of the `with` blocks defines the layout.
with gr.Blocks(
    theme=theme,
    # NOTE(review): the leading "π¨" looks like a mis-encoded emoji; confirm
    # the intended character before changing this string.
    title="π¨ Data Contamination Report",
    analytics_enabled=False,
    fill_height=True,
) as demo:
    gr.Markdown(PANEL_MARKDOWN)

    with gr.Tab("Corpus contamination") as tab_corpus:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_corpus = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_corpora = gr.Textbox(
                    placeholder="Pre-training corpora",
                    label="Pre-training corpora",
                    value="",
                )
            with gr.Column():
                checkboxes_corpus = gr.CheckboxGroup(
                    ["Exclude model-based evidences", "Show only contaminated"],
                    label="Search options",
                    value=[],
                )

        filter_corpus_btn = gr.Button("Filter")

        corpus_dataframe = gr.DataFrame(
            # Initial, unfiltered table. None is neither a str nor a list, so
            # filter_dataframe skips every filter for these placeholders.
            value=filter_dataframe_corpus(None, None, None),
            # NOTE(review): df still contains the "Model or corpus" column
            # that the filter functions drop from the displayed value; confirm
            # the header list and the datatype list below line up with the
            # displayed columns.
            headers=df.columns.to_list(),
            datatype=[
                "markdown",
                "markdown",
                "number",
                "number",
                "number",
                "str",
                "markdown",
                "markdown",
            ],
        )

    with gr.Tab("Model contamination") as tab_model:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_model = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                # This box searches the contaminated *model*, so its label
                # matches its placeholder rather than the corpus tab's label.
                cont_model = gr.Textbox(
                    placeholder="Model",
                    label="Model",
                    value="",
                )
            with gr.Column():
                checkboxes_model = gr.CheckboxGroup(
                    ["Exclude model-based evidences", "Show only contaminated"],
                    label="Search options",
                    value=[],
                )

        filter_model_btn = gr.Button("Filter")

        model_dataframe = gr.DataFrame(
            # Initial, unfiltered table (see the corpus tab for why None).
            value=filter_dataframe_model(None, None, None),
            # NOTE(review): same header/datatype caveat as the corpus table.
            headers=df.columns.to_list(),
            datatype=[
                "markdown",
                "markdown",
                "number",
                "number",
                "number",
                "str",
                "markdown",
                "markdown",
            ],
        )

    # Each "Filter" button re-runs its tab's filter function with the current
    # textbox / checkbox values and replaces the table in that tab.
    filter_corpus_btn.click(
        filter_dataframe_corpus,
        inputs=[eval_dataset_corpus, cont_corpora, checkboxes_corpus],
        outputs=corpus_dataframe,
    )
    filter_model_btn.click(
        filter_dataframe_model,
        inputs=[eval_dataset_model, cont_model, checkboxes_model],
        outputs=model_dataframe,
    )

    with gr.Tab("Contribution Guidelines") as tab_guidelines:
        gr.Markdown(GUIDELINES)


demo.launch()
|
|