Jen Ben Arye committed on
Commit
c8a2d4e
·
1 Parent(s): 0e83f57

Script to load datasets into the KTO schema

Browse files
Files changed (1) hide show
  1. kto_dataset_processor.py +58 -0
kto_dataset_processor.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset, Dataset
2
+ import pandas as pd
3
+ from pdb import set_trace as st
4
+
5
+
6
def process_dataset_ultrafeedback():
    """
    Load 'HuggingFaceH4/ultrafeedback_binarized' and reshape it into KTO-style rows.

    Every source example is expanded into two rows sharing the same prompt:
    one built from the "chosen" conversation (label True) and one built from
    the "rejected" conversation (label False), in that order. Each row has
    the keys "prompt", "completion" and "label".

    Splits are merged by name: any split whose name contains "train" feeds
    the unified train set; otherwise, any split whose name contains "test"
    feeds the unified test set. Splits matching neither are ignored.

    Returns:
        dict: {"train": Dataset, "test": Dataset}, each a Hugging Face
        Dataset built from the collected rows.
    """
    raw_splits = load_dataset("HuggingFaceH4/ultrafeedback_binarized")

    # (source column, KTO label) pairs; chosen is emitted before rejected.
    field_labels = (("chosen", True), ("rejected", False))

    def to_kto_rows(record):
        """Expand one preference pair into its two KTO rows."""
        rows = []
        for field, desirable in field_labels:
            # Presumably index 1 is the assistant reply that follows the user
            # prompt at index 0 -- TODO confirm against the dataset schema.
            reply = record[field][1]["content"]
            rows.append({
                "prompt": record["prompt"],
                "completion": reply.strip(),
                "label": desirable,
            })
        return rows

    # Insertion order matters: "train" is checked before "test", so a split
    # name containing both substrings is routed to train.
    buckets = {"train": [], "test": []}

    # NOTE(review): if the hub dataset exposes several splits containing
    # "train" (or "test") in their names, all of them are concatenated here;
    # confirm that any resulting duplicate pairs are intended.
    for split_name, split_rows in raw_splits.items():
        target = next(
            (rows for key, rows in buckets.items() if key in split_name),
            None,
        )
        if target is None:
            continue
        for record in split_rows:
            target.extend(to_kto_rows(record))

    # Go through pandas so each list of row dicts becomes a columnar Dataset.
    return {
        name: Dataset.from_pandas(pd.DataFrame(rows))
        for name, rows in buckets.items()
    }
54
+
55
+
56
+ # if __name__ == "__main__":
57
+ # kto_dataset = process_dataset_ultrafeedback()
58
+ # st()