Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| # NOTE: names of preset cols may be different based on dataset, this is just a generalized pipeline | |
| CHOSEN_COLUMN = 'chosen' # name of col with chosen responses | |
| REJECTED_COLUMN = 'rejected' # name of col with rejected responses | |
| COLUMNS_TO_DROP = ['metadata', 'timestamp', 'id'] # cols to remove | |
| def transform_rlhf_dataset(df, chosen_col=CHOSEN_COLUMN, rejected_col=REJECTED_COLUMN, drop_cols=COLUMNS_TO_DROP): | |
| """ | |
| Parameters: | |
| df (pandas.DataFrame): Input dataframe with chosen and rejected columns | |
| chosen_col (str): Name of column containing chosen responses | |
| rejected_col (str): Name of column containing rejected responses | |
| drop_cols (list): List of column names to drop from the dataset | |
| Returns: | |
| pandas.DataFrame: Transformed dataset with 'text' and 'label' columns | |
| """ | |
| df = df.copy() | |
| existing_cols_to_drop = [col for col in drop_cols if col in df.columns] | |
| if existing_cols_to_drop: | |
| df = df.drop(columns=existing_cols_to_drop) | |
| preserved_cols = [col for col in df.columns if col not in [chosen_col, rejected_col]] | |
| # two separate dataframes for liked and disliked | |
| liked_df = df[[chosen_col]].copy() | |
| liked_df.columns = ['text'] | |
| liked_df['label'] = 'liked' | |
| disliked_df = df[[rejected_col]].copy() | |
| disliked_df.columns = ['text'] | |
| disliked_df['label'] = 'disliked' | |
| for col in preserved_cols: | |
| liked_df[col] = df[col] | |
| for col in preserved_cols: | |
| disliked_df[col] = df[col] | |
| # combine + shuffle | |
| transformed_df = pd.concat([liked_df, disliked_df], ignore_index=True) | |
| transformed_df = transformed_df.dropna(subset=['text']) | |
| transformed_df = transformed_df.sample(frac=1).reset_index(drop=True) | |
| # reordering | |
| column_order = ['text', 'label'] + preserved_cols | |
| transformed_df = transformed_df[column_order] | |
| return transformed_df | |
| def test_example(): | |
| example_data = { | |
| 'chosen': ['This is a good response', 'Another good one'], | |
| 'rejected': ['This is a bad response', 'Another bad one'], | |
| 'metadata': ['meta1', 'meta2'], | |
| 'timestamp': ['2024-01-01', '2024-01-02'], | |
| 'id': [1, 2] | |
| } | |
| df = pd.DataFrame(example_data) | |
| transformed_df = transform_rlhf_dataset( | |
| df, | |
| chosen_col='chosen', | |
| rejected_col='rejected', | |
| drop_cols=['metadata', 'id'] | |
| ) | |
| print("Original shape:", df.shape) | |
| print("\nTransformed shape:", transformed_df.shape) | |
| print("\nTransformation sample:") | |
| print(transformed_df.head()) | |
| print("\nLabel distribution:") | |
| print(transformed_df['label'].value_counts()) | |
| if __name__ == "__main__": | |
| test_example() |