Jen Ben Arye
committed on
Commit
·
c8a2d4e
1
Parent(s):
0e83f57
script to load datasets in kto schema
Browse files- kto_dataset_processor.py +58 -0
kto_dataset_processor.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset, Dataset
|
2 |
+
import pandas as pd
|
3 |
+
from pdb import set_trace as st
|
4 |
+
|
5 |
+
|
6 |
+
def process_dataset_ultrafeedback():
    """
    Load 'HuggingFaceH4/ultrafeedback_binarized' and flatten it into KTO-style rows.

    Every source example produces two rows, in this order:
      1. the chosen completion, with ``label=True``
      2. the rejected completion, with ``label=False``
    Each row has the keys ``prompt``, ``completion`` (whitespace-stripped)
    and ``label``.

    Splits are merged by name: any split whose name contains "train" feeds
    the unified train set; otherwise any split whose name contains "test"
    feeds the unified test set. Splits matching neither are ignored.

    Returns:
        dict: ``{"train": Dataset, "test": Dataset}`` holding the merged rows
        as Hugging Face Dataset objects.
    """
    raw_splits = load_dataset("HuggingFaceH4/ultrafeedback_binarized")

    # (source column, KTO label) pairs; the order fixes the per-example row order.
    completion_fields = (("chosen", True), ("rejected", False))

    def to_kto_rows(record):
        """Expand one preference record into its chosen/rejected KTO rows."""
        rows = []
        for column, is_desirable in completion_fields:
            # Index 1 is presumably the assistant reply following the user
            # turn at index 0 -- TODO confirm against the dataset schema.
            reply_text = record[column][1]["content"]
            rows.append({
                "prompt": record["prompt"],
                "completion": reply_text.strip(),
                "label": is_desirable,
            })
        return rows

    merged_rows = {"train": [], "test": []}

    for name, split in raw_splits.items():
        # "train" is checked first, so it wins if a name contains both words.
        # NOTE(review): substring matching merges *every* split whose name
        # contains the word; confirm those splits do not overlap, otherwise
        # the unified sets will contain duplicated examples.
        if "train" in name:
            bucket = merged_rows["train"]
        elif "test" in name:
            bucket = merged_rows["test"]
        else:
            continue
        for record in split:
            bucket.extend(to_kto_rows(record))

    # Go through a DataFrame to build each Hugging Face Dataset.
    return {
        split_key: Dataset.from_pandas(pd.DataFrame(rows))
        for split_key, rows in merged_rows.items()
    }
|
54 |
+
|
55 |
+
|
56 |
+
# if __name__ == "__main__":
|
57 |
+
# kto_dataset = process_dataset_ultrafeedback()
|
58 |
+
# st()
|