File size: 506 Bytes
928b43c
 
68731ca
 
 
928b43c
e765db8
68731ca
 
 
928b43c
 
 
 
68731ca
e765db8
 
 
928b43c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import os

from datasets import load_dataset

# Directory passed to `load_dataset` as its `cache_dir`.
CACHE_DIR = "cache"
# Upper bound on how many records `load_data` returns.
N_SAMPLES = 15
# Commit hashes that `load_data` filters out of the dataset.
REMOVED_COMMITS = ["9cc896202dc38d962c01aa2637dbc5bbc3e3dd9b"]


def load_data():
    """Load the commit-rewriting samples and return the leading records.

    Fetches the ``train`` split of ``petrtsv-jb/commit-rewriting-samples``
    into a pandas DataFrame, drops every row whose ``hash`` is listed in
    ``REMOVED_COMMITS``, and keeps the first ``N_SAMPLES`` remaining rows.

    Returns:
        A list with one dict per kept row (column name -> value), in the
        order the rows appear in the dataset.
    """
    df = load_dataset(
        "petrtsv-jb/commit-rewriting-samples",
        split="train",
        # `.get` yields None when HF_REWRITING_TOKEN is unset instead of
        # raising here. NOTE(review): whether the dataset can be fetched
        # without a token is not visible from this file -- confirm.
        token=os.environ.get('HF_REWRITING_TOKEN'),
        cache_dir=CACHE_DIR,
    ).to_pandas()

    kept = df[~df['hash'].isin(REMOVED_COMMITS)]

    # Truncate before converting so that only the rows actually returned are
    # materialized as dicts, rather than the whole dataset.
    return kept.head(N_SAMPLES).to_dict('records')