Jen Ben Arye committed
Commit 239efc0 · 1 Parent(s): 71053f2

updated to only load preference data

Files changed (2):
  1. kto_dataset_processor.py +34 -27
  2. kto_pipeline.py +43 -1
kto_dataset_processor.py CHANGED
@@ -5,54 +5,61 @@ from pdb import set_trace as st
 
 def process_dataset_ultrafeedback():
     """
-    Processes the 'HuggingFaceH4/ultrafeedback_binarized' dataset into a unified train and test split.
+    Processes the 'train_prefs' and 'test_prefs' splits of the 'HuggingFaceH4/ultrafeedback_binarized' dataset
+    into a unified format for preference modeling.
 
     Returns:
         dict: A dictionary containing the unified 'train' and 'test' splits of the dataset in the KTO format.
         Each split is a Hugging Face Dataset object.
     """
-    # Load the dataset
+    # Load the relevant splits of the dataset
     dataset_name = "HuggingFaceH4/ultrafeedback_binarized"
-    dataset = load_dataset(dataset_name)
+    train_prefs = load_dataset(dataset_name, split="train_prefs")
+    test_prefs = load_dataset(dataset_name, split="test_prefs")
 
     # Function to transform a single example into the desired schema
     def transform_data(example):
         data_points = []
         # Chosen completion
         chosen_completion = example["chosen"][1]["content"]
-        data_points.append({
-            "prompt": example["prompt"],
-            "completion": chosen_completion.strip(),
-            "label": True
-        })
+        if chosen_completion.strip():  # Check for non-empty completions
+            data_points.append({
+                "prompt": example["prompt"],
+                "completion": chosen_completion.strip(),
+                "label": True
+            })
         # Rejected completion
         rejected_completion = example["rejected"][1]["content"]
-        data_points.append({
-            "prompt": example["prompt"],
-            "completion": rejected_completion.strip(),
-            "label": False
-        })
+        if rejected_completion.strip():  # Check for non-empty completions
+            data_points.append({
+                "prompt": example["prompt"],
+                "completion": rejected_completion.strip(),
+                "label": False
+            })
         return data_points
 
-    # Combine splits into unified train and test sets
+    # Process train and test splits
     train_data = []
     test_data = []
 
-    for split_name, split_data in dataset.items():
-        if "train" in split_name:
-            for example in split_data:
-                train_data.extend(transform_data(example))
-        elif "test" in split_name:
-            for example in split_data:
-                test_data.extend(transform_data(example))
+    for example in train_prefs:
+        train_data.extend(transform_data(example))
 
-    # Convert unified data to Hugging Face Dataset
-    unified_train = Dataset.from_pandas(pd.DataFrame(train_data))
-    unified_test = Dataset.from_pandas(pd.DataFrame(test_data))
+    for example in test_prefs:
+        test_data.extend(transform_data(example))
+
+    # Convert unified data to DataFrames
+    train_df = pd.DataFrame(train_data)
+    test_df = pd.DataFrame(test_data)
+
+    # Convert to Hugging Face Dataset
+    unified_train = Dataset.from_pandas(train_df)
+    unified_test = Dataset.from_pandas(test_df)
 
     return {"train": unified_train, "test": unified_test}
 
 
-# if __name__ == "__main__":
-#     kto_dataset = process_dataset_ultrafeedback()
-#     st()
+if __name__ == "__main__":
+    kto_dataset = process_dataset_ultrafeedback()
+    st()
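
A quick way to sanity-check the reworked processor (a minimal sketch, not part of this commit; it assumes the repo root is on the Python path and that datasets and pandas are installed):

    # Sketch: spot-check the unpaired KTO-format splits produced above.
    from kto_dataset_processor import process_dataset_ultrafeedback

    data = process_dataset_ultrafeedback()
    train = data["train"]

    print(train.column_names)  # ['prompt', 'completion', 'label']

    # Each UltraFeedback pair contributes up to two rows: the chosen
    # completion (label=True) followed by the rejected one (label=False),
    # skipping any completion that is empty after stripping.
    print(train[0]["label"], train[1]["label"])

The new empty-completion filter is also why the row count can come out slightly below twice the number of source examples.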
kto_pipeline.py CHANGED
@@ -2,7 +2,7 @@ import torch
 from dataclasses import dataclass
 from accelerate import PartialState
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
-from trl import KTOConfig, KTOTrainer, ModelConfig, get_peft_config
+from trl import KTOConfig, KTOTrainer, ModelConfig, get_peft_config, maybe_unpair_preference_dataset, setup_chat_format
 from kto_dataset_processor import process_dataset_ultrafeedback
 from datetime import datetime
 import wandb
@@ -78,8 +78,45 @@ def load_model_and_tokenizer(model_args):
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
 
+    # Setup chat format if not present
+    if tokenizer.chat_template is None:
+        model, tokenizer = setup_chat_format(model, tokenizer)
+
     return model, tokenizer
 
+
+# def find_unknown_tokens(tokenizer, texts):
+#     """
+#     Identify tokens in the dataset that are not in the tokenizer's vocabulary.
+#     """
+#     all_tokens = set()
+#     for text in texts:
+#         tokens = tokenizer.tokenize(text)
+#         all_tokens.update(tokens)
+#     vocab = set(tokenizer.get_vocab().keys())
+#     unknown_tokens = all_tokens - vocab
+#     return unknown_tokens
+
+
+# def add_tokens_to_tokenizer(tokenizer, model, dataset):
+#     """
+#     Extend the tokenizer's vocabulary with missing tokens and resize the model embeddings.
+#     """
+#     # Extract all texts from the dataset
+#     texts = [example["completion"] for example in dataset["train"]]
+
+#     # Identify unknown tokens
+#     unknown_tokens = find_unknown_tokens(tokenizer, texts)
+#     print(f"Found {len(unknown_tokens)} unknown tokens: {list(unknown_tokens)[:10]}...")
+
+#     # Add unknown tokens to tokenizer
+#     tokenizer.add_tokens(list(unknown_tokens))
+#     model.resize_token_embeddings(len(tokenizer))
+#     print(f"Tokenizer vocabulary size after extension: {len(tokenizer)}")
+
+
 ####################################
 # MAIN LOGIC
 ####################################
@@ -99,6 +136,11 @@ def main():
     dataset = process_dataset_ultrafeedback()
     print("Dataset processed.")
 
+    # # Extend tokenizer with missing tokens
+    # print("Adding unknown tokens to tokenizer...")
+    # add_tokens_to_tokenizer(tokenizer, model, dataset)
+    # print("Tokenizer updated.")
+
     # Initialize trainer
     print("Initializing trainer...")
     trainer = KTOTrainer(
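
For context, a minimal sketch of how these processed splits feed the trainer (not part of the commit; model, tokenizer, and dataset are assumed to come from load_model_and_tokenizer and process_dataset_ultrafeedback above, the KTOConfig values are illustrative placeholders, and newer TRL releases rename the tokenizer argument to processing_class):

    # Sketch: wire the unpaired prompt/completion/label splits into KTOTrainer.
    from trl import KTOConfig, KTOTrainer

    training_args = KTOConfig(
        output_dir="./kto-output",        # placeholder, not this repo's config
        per_device_train_batch_size=4,    # placeholder
    )
    trainer = KTOTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],   # unpaired rows: prompt / completion / label
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
    )
    trainer.train()

Since the processor already emits unpaired rows, the newly imported maybe_unpair_preference_dataset would only come into play if a paired preference dataset were passed to the pipeline instead.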