Granther committed on
Commit
c941f24
1 Parent(s): 1d4d79d

Upload modelling/ft.py with huggingface_hub

Files changed (1)
  1. modelling/ft.py +165 -0
modelling/ft.py ADDED
@@ -0,0 +1,165 @@
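+ # Supervised fine-tuning of Phi-3-mini-128k-instruct with LoRA adapters,
+ # driven by TRL's SFTTrainer on a 10k-example slice of BAAI/Infinity-Instruct.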
+ import sys
+ import logging
+
+ import datasets
+ from datasets import load_dataset
+ from peft import LoraConfig
+ import torch
+ import transformers
+ from trl import SFTTrainer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
+
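+ # Hyperparameters unpacked into transformers.TrainingArguments below: bf16,
+ # cosine LR decay with 20% warmup, non-reentrant gradient checkpointing to
+ # save activation memory, and a checkpoint every 100 steps (keeping only one).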
+ training_config = {
+     "bf16": True,
+     "do_eval": False,
+     "learning_rate": 5.0e-06,
+     "log_level": "info",
+     "logging_steps": 20,
+     "logging_strategy": "steps",
+     "lr_scheduler_type": "cosine",
+     "num_train_epochs": 1,
+     "max_steps": -1,
+     "output_dir": "./instruct_chk_dir",
+     "overwrite_output_dir": True,
+     "per_device_eval_batch_size": 4,
+     "per_device_train_batch_size": 4,
+     "remove_unused_columns": True,
+     "save_steps": 100,
+     "save_total_limit": 1,
+     "seed": 0,
+     "gradient_checkpointing": True,
+     "gradient_checkpointing_kwargs": {"use_reentrant": False},
+     "gradient_accumulation_steps": 1,
+     "warmup_ratio": 0.2,
+ }
+
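+ # LoRA settings: rank-16 adapters (alpha 32, dropout 0.05) injected into all
+ # linear layers; the base weights stay frozen, so only the adapters are trained.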
+ peft_config = {
+     "r": 16,
+     "lora_alpha": 32,
+     "lora_dropout": 0.05,
+     "bias": "none",
+     "task_type": "CAUSAL_LM",
+     "target_modules": "all-linear",
+     "modules_to_save": None,
+ }
+
+ config = {
+     "max_len": 4096,
+ }
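+ # max_len (4096) is reused for tokenizer.model_max_length and SFTTrainer's max_seq_length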
+
+ train_conf = TrainingArguments(**training_config)
+ peft_conf = LoraConfig(**peft_config)
+
+
+ # Model Init
+ checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"
+ model_kwargs = dict(
+     use_cache=False,  # the KV cache is unused during training and wastes memory
+     trust_remote_code=True,
+     attn_implementation="flash_attention_2",  # loading the model with flash-attention support
+     torch_dtype=torch.bfloat16,
+     # device_map=None
+     device_map="sequential",
+ )
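+ # device_map="sequential" fills the available GPUs in order rather than balancing layers across them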
+ model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
+ tokenizer.model_max_length = config['max_len']
+ tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
+ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
+ tokenizer.padding_side = 'right'
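+ # right padding is the safe choice for training; eval below switches to left padding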
+
+ dataset_id = "BAAI/Infinity-Instruct"
+ raw_dataset = load_dataset(dataset_id, "0625", split="train")
+ dataset = raw_dataset.select(range(10000))
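+ # only the first 10k examples are kept, which keeps this a short fine-tuning run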
+
+
+ # Preproc dataset
+ def preproc(example, tokenizer):
+     convo = example['conversations']
+     for i, dic in enumerate(convo):
+         dic['role'] = dic.pop('from')
+         dic['content'] = dic.pop('value')
+         if dic['role'] == 'gpt':
+             dic['role'] = 'assistant'
+         elif dic['role'] == 'human':
+             dic['role'] = 'user'
+
+     example['text'] = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
+     return example
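+ # e.g. [{'from': 'human', 'value': 'hi'}, {'from': 'gpt', 'value': 'hey'}] becomes
+ # [{'role': 'user', 'content': 'hi'}, {'role': 'assistant', 'content': 'hey'}],
+ # which the chat template then renders into a single 'text' string for SFT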
89
+
90
+ column_names = list(dataset.features)
91
+ train_dataset = dataset.map(
92
+ preproc,
93
+ fn_kwargs={"tokenizer": tokenizer},
94
+ num_proc=10,
95
+ remove_columns=column_names
96
+ )
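+ # remove_columns drops the raw fields, leaving only the rendered 'text' column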
+
+ # eval_dataset = dataset.select(range(9000, 10000))  # select() keeps a Dataset; slicing with [9000:] would return plain dicts
+ # eval_dataset = eval_dataset.map(
+ #     preproc,
+ #     fn_kwargs={"tokenizer": tokenizer},
+ #     num_proc=10,
+ #     remove_columns=column_names
+ # )
+
+
+ # Train Model
+ trainer = SFTTrainer(
+     model=model,
+     args=train_conf,
+     peft_config=peft_conf,
+     train_dataset=train_dataset,
+     # eval_dataset=eval_dataset,
+     max_seq_length=config['max_len'],
+     dataset_text_field="text",
+     tokenizer=tokenizer,
+     packing=True
+ )
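+ # packing=True concatenates short examples into full 4096-token sequences,
+ # so batches carry far fewer pad tokens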
+ train_result = trainer.train()
+ metrics = train_result.metrics
+ trainer.log_metrics("train", metrics)
+ trainer.save_metrics("train", metrics)
+ trainer.save_state()
+
+
+ # Eval Model (disabled until the eval_dataset above is wired into SFTTrainer)
+ # tokenizer.padding_side = 'left'
+ # metrics = trainer.evaluate()
+ # metrics["eval_samples"] = len(eval_dataset)
+ # trainer.log_metrics("eval", metrics)
+ # trainer.save_metrics("eval", metrics)
+
+
+ # Save model
+ trainer.save_model(train_conf.output_dir)
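+ # with peft_config set, this should write only the LoRA adapter weights.
+ # A sketch for loading the adapter later (assumes peft's AutoPeftModelForCausalLM):
+ #   from peft import AutoPeftModelForCausalLM
+ #   model = AutoPeftModelForCausalLM.from_pretrained("./instruct_chk_dir")
+ #   model = model.merge_and_unload()  # fold the adapters back into the base weights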
+
+ # def apply_chat_template(
+ #     example,
+ #     tokenizer,
+ # ):
+ #     messages = example["messages"]
+ #     example["text"] = tokenizer.apply_chat_template(
+ #         messages, tokenize=False, add_generation_prompt=False)
+ #     return example
+
+ # raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
+ # train_dataset = raw_dataset["train_sft"].select(range(10000))
+ # test_dataset = raw_dataset["test_sft"].select(range(1000))
+ # column_names = list(train_dataset.features)
+
+ # processed_train_dataset = train_dataset.map(
+ #     apply_chat_template,
+ #     fn_kwargs={"tokenizer": tokenizer},
+ #     num_proc=10,
+ #     remove_columns=column_names,
+ #     desc="Applying chat template to train_sft",
+ # )
+
+ # processed_test_dataset = test_dataset.map(
+ #     apply_chat_template,
+ #     fn_kwargs={"tokenizer": tokenizer},
+ #     num_proc=10,
+ #     remove_columns=column_names,
+ #     desc="Applying chat template to test_sft",
+ # )