import numpy as np
import pandas as pd
# Default name of the column holding the preferred response of each pair.
CHOSEN_COLUMN = 'chosen'

# Default name of the column holding the dispreferred response of each pair.
REJECTED_COLUMN = 'rejected'

# Columns removed by default before reshaping; names absent from the input
# are ignored.
COLUMNS_TO_DROP = ['metadata', 'timestamp', 'id']


def transform_rlhf_dataset(df, chosen_col=CHOSEN_COLUMN, rejected_col=REJECTED_COLUMN,
                           drop_cols=COLUMNS_TO_DROP, random_state=None):
    """Reshape a chosen/rejected preference table into labelled text rows.

    Every input row yields up to two output rows: one holding the value of
    ``chosen_col`` labelled ``'liked'`` and one holding the value of
    ``rejected_col`` labelled ``'disliked'``. Rows whose text is missing are
    discarded and the result is shuffled. The input frame is not modified.

    Parameters:
        df (pandas.DataFrame): Input dataframe with chosen and rejected columns
        chosen_col (str): Name of column containing chosen responses
        rejected_col (str): Name of column containing rejected responses
        drop_cols (list): Column names to drop from the dataset. Names that
            are not present are ignored; ``None`` drops nothing and a single
            string is treated as one column name.
        random_state: Seed or random state forwarded to ``DataFrame.sample``
            for the shuffle. ``None`` (the default) leaves the shuffle
            unseeded.

    Returns:
        pandas.DataFrame: Transformed dataset with 'text' and 'label' columns
        followed by every other column that was not dropped, on a fresh
        0..n-1 index.

    Raises:
        KeyError: If ``chosen_col`` or ``rejected_col`` is not available
            after ``drop_cols`` has been applied.
        ValueError: If a column that would be carried through is itself
            named 'text' or 'label' and would therefore clash with the
            generated columns.
    """
    # Normalise drop_cols without mutating the (shared) default list.
    if drop_cols is None:
        drop_cols = []
    elif isinstance(drop_cols, str):
        # A bare string would otherwise be iterated character by character.
        drop_cols = [drop_cols]

    # DataFrame.drop returns a new frame, so the caller's data stays intact
    # without an up-front full copy.
    present_drops = [col for col in drop_cols if col in df.columns]
    if present_drops:
        df = df.drop(columns=present_drops)

    missing = [col for col in (chosen_col, rejected_col) if col not in df.columns]
    if missing:
        raise KeyError(
            f"Required column(s) {missing} not found in the dataframe "
            f"(after dropping {present_drops})"
        )

    preserved_cols = [col for col in df.columns if col not in (chosen_col, rejected_col)]

    # A carried-through column named 'text' or 'label' would overwrite the
    # generated column and duplicate it in the output, so refuse it.
    clashes = [col for col in preserved_cols if col in ('text', 'label')]
    if clashes:
        raise ValueError(
            f"Column(s) {clashes} clash with the generated 'text'/'label' "
            "columns; rename them or add them to drop_cols"
        )

    def _labelled_half(source_col, label):
        # One half of the output: the source column renamed to 'text', the
        # constant label right after it, then the preserved columns.
        half = df[[source_col] + preserved_cols].rename(columns={source_col: 'text'})
        half.insert(1, 'label', label)
        return half

    combined = pd.concat(
        [_labelled_half(chosen_col, 'liked'), _labelled_half(rejected_col, 'disliked')],
        ignore_index=True,
    )
    combined = combined.dropna(subset=['text'])

    # frac=1 draws every remaining row once, i.e. a full shuffle.
    shuffled = combined.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)
def test_example():
    """Run the transform on a two-row sample, verify the result, and print it.

    'metadata' and 'id' are dropped explicitly; 'timestamp' is not in the
    drop list here, so it is carried through as an extra column.
    """
    example_data = {
        'chosen': ['This is a good response', 'Another good one'],
        'rejected': ['This is a bad response', 'Another bad one'],
        'metadata': ['meta1', 'meta2'],
        'timestamp': ['2024-01-01', '2024-01-02'],
        'id': [1, 2],
    }

    df = pd.DataFrame(example_data)
    transformed_df = transform_rlhf_dataset(
        df,
        chosen_col='chosen',
        rejected_col='rejected',
        drop_cols=['metadata', 'id'],
    )

    # The output row order is shuffled, so only order-independent properties
    # are checked: two rows per input row and the expected column layout.
    assert transformed_df.shape == (4, 3)
    assert list(transformed_df.columns) == ['text', 'label', 'timestamp']
    assert transformed_df['label'].value_counts().to_dict() == {'liked': 2, 'disliked': 2}

    print("Original shape:", df.shape)
    print("\nTransformed shape:", transformed_df.shape)
    print("\nTransformation sample:")
    print(transformed_df.head())
    print("\nLabel distribution:")
    print(transformed_df['label'].value_counts())
if __name__ == "__main__":
    # Run the example only when executed as a script, not on import.
    test_example()