Spaces:

feel-fl
/

open-human-feedback-chat

Running

open-human-feedback-chat / ml /eval /data_transform_pipeline.py

burtenshaw

migrate all ml files into subdir

e9484c6 about 1 month ago

2.73 kB

	import pandas as pd
	import numpy as np

	# NOTE: names of preset cols may be different based on dataset, this is just a generalized pipeline

	CHOSEN_COLUMN = 'chosen' # name of col with chosen responses
	REJECTED_COLUMN = 'rejected' # name of col with rejected responses
	COLUMNS_TO_DROP = ['metadata', 'timestamp', 'id'] # cols to remove

	def transform_rlhf_dataset(df, chosen_col=CHOSEN_COLUMN, rejected_col=REJECTED_COLUMN, drop_cols=COLUMNS_TO_DROP):
	"""
	Parameters:
	df (pandas.DataFrame): Input dataframe with chosen and rejected columns
	chosen_col (str): Name of column containing chosen responses
	rejected_col (str): Name of column containing rejected responses
	drop_cols (list): List of column names to drop from the dataset

	Returns:
	pandas.DataFrame: Transformed dataset with 'text' and 'label' columns
	"""

	df = df.copy()

	existing_cols_to_drop = [col for col in drop_cols if col in df.columns]
	if existing_cols_to_drop:
	df = df.drop(columns=existing_cols_to_drop)

	preserved_cols = [col for col in df.columns if col not in [chosen_col, rejected_col]]

	# two separate dataframes for liked and disliked
	liked_df = df[[chosen_col]].copy()
	liked_df.columns = ['text']
	liked_df['label'] = 'liked'

	disliked_df = df[[rejected_col]].copy()
	disliked_df.columns = ['text']
	disliked_df['label'] = 'disliked'

	for col in preserved_cols:
	liked_df[col] = df[col]
	for col in preserved_cols:
	disliked_df[col] = df[col]

	# combine + shuffle
	transformed_df = pd.concat([liked_df, disliked_df], ignore_index=True)
	transformed_df = transformed_df.dropna(subset=['text'])
	transformed_df = transformed_df.sample(frac=1).reset_index(drop=True)

	# reordering
	column_order = ['text', 'label'] + preserved_cols
	transformed_df = transformed_df[column_order]

	return transformed_df

	def test_example():
	example_data = {
	'chosen': ['This is a good response', 'Another good one'],
	'rejected': ['This is a bad response', 'Another bad one'],
	'metadata': ['meta1', 'meta2'],
	'timestamp': ['2024-01-01', '2024-01-02'],
	'id': [1, 2]
	}

	df = pd.DataFrame(example_data)
	transformed_df = transform_rlhf_dataset(
	df,
	chosen_col='chosen',
	rejected_col='rejected',
	drop_cols=['metadata', 'id']
	)

	print("Original shape:", df.shape)
	print("\nTransformed shape:", transformed_df.shape)
	print("\nTransformation sample:")
	print(transformed_df.head())
	print("\nLabel distribution:")
	print(transformed_df['label'].value_counts())

	if __name__ == "__main__":
	test_example()