Riddhi Bhagwat committed on
Commit
261522b
·
1 Parent(s): 04f923c

Add files via upload

Browse files
Files changed (1) hide show
  1. data_transform_pipeline.py +80 -0
data_transform_pipeline.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
# NOTE: column names differ between datasets; the values below are generic
# defaults and can be overridden per call.

CHOSEN_COLUMN = 'chosen'  # name of col with chosen responses
REJECTED_COLUMN = 'rejected'  # name of col with rejected responses
COLUMNS_TO_DROP = ['metadata', 'timestamp', 'id']  # cols to remove


def transform_rlhf_dataset(
    df,
    chosen_col=CHOSEN_COLUMN,
    rejected_col=REJECTED_COLUMN,
    drop_cols=COLUMNS_TO_DROP,
    random_state=None,
):
    """
    Turn a chosen/rejected preference table into one labelled row per response.

    Every input row yields up to two output rows: the chosen response labelled
    'liked' and the rejected response labelled 'disliked'. Rows whose response
    text is missing are discarded, the remaining columns are carried along
    unchanged, and the result is shuffled.

    Parameters:
    df (pandas.DataFrame): Input dataframe with chosen and rejected columns.
        It is not modified.
    chosen_col (str): Name of column containing chosen responses
    rejected_col (str): Name of column containing rejected responses
    drop_cols (list): List of column names to drop from the dataset. Names
        that are not present are ignored; None means "drop nothing".
    random_state: Seed (or generator) forwarded to DataFrame.sample for the
        shuffle. The default None keeps the shuffle non-deterministic.

    Returns:
    pandas.DataFrame: Transformed dataset with 'text' and 'label' columns
        first, followed by every remaining input column in its original order.

    Raises:
    KeyError: If chosen_col or rejected_col is not available after dropping.
    ValueError: If a column that would be carried through is itself named
        'text' or 'label' and would therefore clobber an output column.
    """
    drop_cols = [] if drop_cols is None else list(drop_cols)

    # drop() returns a new frame, so the caller's dataframe is never touched.
    kept = df.drop(columns=drop_cols, errors='ignore')

    missing = [col for col in (chosen_col, rejected_col) if col not in kept.columns]
    if missing:
        raise KeyError(
            f"Response column(s) {missing} not found in dataframe "
            f"(available after dropping: {list(kept.columns)})"
        )

    preserved_cols = [
        col for col in kept.columns if col not in (chosen_col, rejected_col)
    ]

    # A pass-through column called 'text' or 'label' would silently overwrite
    # the response text / the liked-disliked label, so refuse it explicitly.
    clashes = [col for col in preserved_cols if col in ('text', 'label')]
    if clashes:
        raise ValueError(
            f"Column(s) {clashes} collide with the reserved output columns "
            "'text'/'label'; rename them or add them to drop_cols"
        )

    # One frame per response kind, each with the shared columns attached.
    frames = [
        kept[[source_col] + preserved_cols]
        .rename(columns={source_col: 'text'})
        .assign(label=label)
        for source_col, label in ((chosen_col, 'liked'), (rejected_col, 'disliked'))
    ]

    # combine + shuffle
    combined = pd.concat(frames, ignore_index=True)
    combined = combined.dropna(subset=['text'])
    combined = combined.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # reordering
    return combined[['text', 'label'] + preserved_cols]
54
+
55
def test_example():
    """Run transform_rlhf_dataset on a two-row sample and print a short report."""
    sample_frame = pd.DataFrame({
        'chosen': ['This is a good response', 'Another good one'],
        'rejected': ['This is a bad response', 'Another bad one'],
        'metadata': ['meta1', 'meta2'],
        'timestamp': ['2024-01-01', '2024-01-02'],
        'id': [1, 2],
    })

    # 'timestamp' is not listed in drop_cols here, so it is carried through
    # to the output alongside 'text' and 'label'.
    result = transform_rlhf_dataset(
        sample_frame,
        chosen_col='chosen',
        rejected_col='rejected',
        drop_cols=['metadata', 'id'],
    )
    label_counts = result['label'].value_counts()

    print("Original shape:", sample_frame.shape)
    print("\nTransformed shape:", result.shape)
    print("\nTransformation sample:")
    print(result.head())
    print("\nLabel distribution:")
    print(label_counts)
78
+
79
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    test_example()