from datasets import load_dataset, Dataset
import pandas as pd
from pdb import set_trace as st  # breakpoint helper for manual inspection


def process_dataset_ultrafeedback():
    """
    Processes the 'train_prefs' and 'test_prefs' splits of the 'HuggingFaceH4/ultrafeedback_binarized' dataset
    into a unified format for preference modeling.

    Returns:
        dict: A dictionary containing the unified 'train' and 'test' splits of the dataset in the KTO format.
              Each split is a Hugging Face Dataset object.
    """
    # Load the relevant splits of the dataset
    dataset_name = "HuggingFaceH4/ultrafeedback_binarized"
    train_prefs = load_dataset(dataset_name, split="train_prefs")
    test_prefs = load_dataset(dataset_name, split="test_prefs")

    # Function to transform a single example into the desired schema
    def transform_data(example):
        data_points = []
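        # 'chosen' and 'rejected' are assumed to be chat-style message lists of
        # the form [user turn, assistant turn], so index 1 holds the assistant
        # completion.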
        # Chosen completion
        chosen_completion = example["chosen"][1]["content"]
        if chosen_completion.strip():  # Check for non-empty completions
            data_points.append({
                "prompt": example["prompt"],
                "completion": chosen_completion.strip(),
                "label": True
            })
        # Rejected completion
        rejected_completion = example["rejected"][1]["content"]
        if rejected_completion.strip():  # Check for non-empty completions
            data_points.append({
                "prompt": example["prompt"],
                "completion": rejected_completion.strip(),
                "label": False
            })
        return data_points

    # Process train and test splits
    train_data = []
    test_data = []

    for example in train_prefs:
        train_data.extend(transform_data(example))

    for example in test_prefs:
        test_data.extend(transform_data(example))

    # Convert unified data to DataFrames
    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)

    # Convert the DataFrames to Hugging Face Dataset objects
    unified_train = Dataset.from_pandas(train_df)
    unified_test = Dataset.from_pandas(test_df)

    return {"train": unified_train, "test": unified_test}


if __name__ == "__main__":
    kto_dataset = process_dataset_ultrafeedback()
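    # Quick sanity check (a minimal sketch): report split sizes and columns,
    # then show one example row before dropping into the debugger below.
    for split_name, split in kto_dataset.items():
        print(f"{split_name}: {len(split)} rows, columns: {split.column_names}")
    print(kto_dataset["train"][0])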
    # Drop into pdb for interactive inspection of the processed splits
    st()