Spaces:

feel-fl
/

open-human-feedback-chat

Running

App Files Files Community

Riddhi Bhagwat commited on Nov 9, 2024

Commit

261522b

1 Parent(s): 04f923c

Add files via upload

Browse files

Files changed (1) hide show

data_transform_pipeline.py +80 -0

data_transform_pipeline.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import pandas as pd
+import numpy as np
+# NOTE: names of preset cols may be different based on dataset, this is just a generalized pipeline
+CHOSEN_COLUMN = 'chosen'  # name of col with chosen responses
+REJECTED_COLUMN = 'rejected'  # name of col with rejected responses
+COLUMNS_TO_DROP = ['metadata', 'timestamp', 'id']  # cols to remove
+def transform_rlhf_dataset(df, chosen_col=CHOSEN_COLUMN, rejected_col=REJECTED_COLUMN, drop_cols=COLUMNS_TO_DROP):
+    """
+    Parameters:
+    df (pandas.DataFrame): Input dataframe with chosen and rejected columns
+    chosen_col (str): Name of column containing chosen responses
+    rejected_col (str): Name of column containing rejected responses
+    drop_cols (list): List of column names to drop from the dataset
+    Returns:
+    pandas.DataFrame: Transformed dataset with 'text' and 'label' columns
+    """
+    df = df.copy()
+    existing_cols_to_drop = [col for col in drop_cols if col in df.columns]
+    if existing_cols_to_drop:
+        df = df.drop(columns=existing_cols_to_drop)
+    preserved_cols = [col for col in df.columns if col not in [chosen_col, rejected_col]]
+    # two separate dataframes for liked and disliked
+    liked_df = df[[chosen_col]].copy()
+    liked_df.columns = ['text']
+    liked_df['label'] = 'liked'
+    disliked_df = df[[rejected_col]].copy()
+    disliked_df.columns = ['text']
+    disliked_df['label'] = 'disliked'
+    for col in preserved_cols:
+        liked_df[col] = df[col]
+    for col in preserved_cols:
+        disliked_df[col] = df[col]
+    # combine + shuffle
+    transformed_df = pd.concat([liked_df, disliked_df], ignore_index=True)
+    transformed_df = transformed_df.dropna(subset=['text'])
+    transformed_df = transformed_df.sample(frac=1).reset_index(drop=True)
+    # reordering
+    column_order = ['text', 'label'] + preserved_cols
+    transformed_df = transformed_df[column_order]
+    return transformed_df
+def test_example():
+    example_data = {
+        'chosen': ['This is a good response', 'Another good one'],
+        'rejected': ['This is a bad response', 'Another bad one'],
+        'metadata': ['meta1', 'meta2'],
+        'timestamp': ['2024-01-01', '2024-01-02'],
+        'id': [1, 2]
+    }
+    df = pd.DataFrame(example_data)
+    transformed_df = transform_rlhf_dataset(
+        df,
+        chosen_col='chosen',
+        rejected_col='rejected',
+        drop_cols=['metadata', 'id']
+    )
+    print("Original shape:", df.shape)
+    print("\nTransformed shape:", transformed_df.shape)
+    print("\nTransformation sample:")
+    print(transformed_df.head())
+    print("\nLabel distribution:")
+    print(transformed_df['label'].value_counts())
+if __name__ == "__main__":
+    test_example()