liang.zhao committed
Commit 67b506b
1 Parent(s): 8c1c087

update model and config

.gitattributes CHANGED
@@ -35,3 +35,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 pytorch_model-00001-of-00002.bin filter=lfs diff=lfs merge=lfs -text
 pytorch_model-00002-of-00002.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00001-of-00004.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00002-of-00004.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00003-of-00004.bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model-00004-of-00004.bin filter=lfs diff=lfs merge=lfs -text
config.json CHANGED
@@ -33,7 +33,7 @@
   "rms_norm_eps": 1e-06,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.33.1",
+  "transformers_version": "4.34.0",
   "use_cache": true,
-  "vocab_size": 65519
+  "vocab_size": 65536
 }
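
As a quick sanity check of the config change (vocab_size goes from 65519 to 65536, and the recorded transformers version moves to 4.34.0), the updated file can be inspected with AutoConfig. A minimal sketch, assuming a local checkout of this commit or the Hub repo id Skywork/Skywork-13B-Base (the repo id is an assumption here); trust_remote_code is needed because the repo ships its own modeling_skywork.py:

from transformers import AutoConfig

# Hypothetical source: point this at a local path to this commit if the repo id differs.
config = AutoConfig.from_pretrained("Skywork/Skywork-13B-Base", trust_remote_code=True)

assert config.vocab_size == 65536                       # was 65519 before this commit
print(config.torch_dtype)                               # bfloat16, unchanged
print(getattr(config, "transformers_version", None))    # the committed config.json records "4.34.0"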
modeling_skywork.py CHANGED
@@ -179,6 +179,27 @@ class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
+class SkyworkNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+
+        base = (self.base * self.scaling_factor) ** (self.dim / (self.dim - 2))
+        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
@@ -189,7 +210,7 @@ def rotate_half(x):
 # Copied from transformers.models.gpt_neox.modeling_gpt_neox.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     cos = cos[position_ids].unsqueeze(1)  # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim]
-    sin = sin[position_ids].unsqueeze(1)
+    sin = sin[position_ids].unsqueeze(1)  #
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
@@ -290,6 +311,13 @@ class SkyworkAttention(nn.Module):
                     scaling_factor=scaling_factor,
                     base=self.rope_theta,
                 )
+            elif scaling_type == "ntk":
+                self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
             else:
                 raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
 
 
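Note on the new "ntk" scaling path: unlike the linear and dynamic variants it does not rescale position indices; it rescales the RoPE base once to (base * scaling_factor) ** (dim / (dim - 2)) and then derives the inverse frequencies as usual. It is selected through the rope_scaling config entry (presumably {"type": "ntk", "factor": ...}, following the Llama-style _init_rope dispatch visible in the hunk above). A self-contained sketch of the same frequency computation; head_dim and the factor below are illustrative assumptions, not values taken from this commit:

import torch

def ntk_inv_freq(dim: int, base: float = 10000.0, scaling_factor: float = 1.0) -> torch.Tensor:
    # Same formula as SkyworkNTKScalingRotaryEmbedding._set_cos_sin_cache above.
    scaled_base = (base * scaling_factor) ** (dim / (dim - 2))
    return 1.0 / (scaled_base ** (torch.arange(0, dim, 2).float() / dim))

dim, factor = 128, 4.0                       # assumed values, for illustration only
plain = ntk_inv_freq(dim, scaling_factor=1.0)
ntk = ntk_inv_freq(dim, scaling_factor=factor)

# The highest frequency (index 0) is left untouched while the lowest is slowed down by the
# full factor, which is the usual NTK-aware interpolation behaviour.
print((plain[0] / ntk[0]).item())            # 1.0
print((plain[-1] / ntk[-1]).item())          # ~4.0
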
pytorch_model-00001-of-00002.bin → pytorch_model-00001-of-00004.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ef96286501811b2ee17470bbf8c071cafbd66f36e7b9a0d3e0f5fa43a5c6ae28
-size 9983005310
+oid sha256:f344f1c62a065f471de22d3a9ac6a4a4d2c1b8f98a2251080e59be55f7d77632
+size 3982977952
pytorch_model-00002-of-00002.bin → pytorch_model-00002-of-00004.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b8e1887f1eef677bac502bfdb6e534601b291a7586c36abb686cfcd398a7e2a7
-size 4485927597
+oid sha256:2fcff58e56b6d24abd9588ba1b17c58fbfe76cbfe1c35014c5bd59aa69f8fb7a
+size 3959875181
pytorch_model-00003-of-00004.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b903e4ba9505d009982736e9eb21f77c7151f91d0bb8d846756929f147e3eb9
+size 3966949023
pytorch_model-00004-of-00004.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:573ab5962dc857d263679e9e6e7493a0d150ec905ba5172af38cbb2eedac5f29
+size 2559125753
pytorch_model.bin.index.json CHANGED
The diff for this file is too large to render.
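
Although the index diff is not rendered, the file keeps the usual sharded-checkpoint layout: a metadata.total_size field and a weight_map from parameter name to shard file, which from_pretrained uses to resolve the four new shards. A small sketch for inspecting it from a local checkout (the relative path is an assumption):

import json
from collections import Counter

# Assumed relative path inside a local clone of this commit.
with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])       # total parameter bytes across all shards

# How many tensors each of the four shards now holds.
for shard, n in sorted(Counter(index["weight_map"].values()).items()):
    print(f"{shard}: {n} tensors")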
 
skywork_13b_sft.sh ADDED
@@ -0,0 +1,128 @@
+set -x
+export WANDB_API_KEY=${WANDB_API_KEY:-YOUR_WANDB_API_KEY}
+export WANDB_ENTITY=${WANDB_ENTITY:-YOUR_WANDB_ENTITY}
+export WANDB_PROJECT=${WANDB_PROJECT:-YOUR_WANDB_PROJECT}
+
+GPUS_PER_NODE=8
+NODE_RANK=$([ -z "$RANK" ] && echo -n 0 || echo -n $RANK)
+NNODES=$([ -z "$WORLD_SIZE" ] && echo -n 1 || echo -n $WORLD_SIZE)
+
+DEBUG="false"
+USE_LORA="false"
+TASK_TYPE="sft"
+
+MAX_STEP=1000
+LR=1e-4
+MAX_LENGTH=4096
+
+GLOBAL_BATCH_SIZE=32 # 8 * 4
+MICRO_BATCH_SIZE=1
+SAVE_STEP=500
+EVAL_STEP=500
+GRAD_ACC=$((${GLOBAL_BATCH_SIZE} / (${GPUS_PER_NODE} * $NNODES * ${MICRO_BATCH_SIZE}) ))
+
+FLAG=Skywork-13B-Base-sft-peaklr${LR}-steps${MAX_STEP}-gbs${GLOBAL_BATCH_SIZE}
+
+ROOT_PATH=${ROOT_PATH:-/data/user/your_name}
+MODEL_PATH=${MODEL_PATH:-SKYWORK_13B_BASE_MODEL_PATH}
+
+SFT_DATA_DIR=${SFT_DATA_DIR:-"YOUR_DATA_DIR"}
+DATA_CACHE_DIR=${DATA_CACHE_DIR:-"YOUR_DATA_CACHE_DIR"}
+
+OUTPUT_DIR=$ROOT_PATH/run_output/skywork-13b-sft-trainer/$FLAG
+LOAD_MODEL_PATH=$([ -z "$MODEL_PATH" ] && echo -n "$OUTPUT_DIR" || echo -n "$MODEL_PATH")
+
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --master_port 29501"
+if [[ $NNODES -gt 1 ]]; then
+
+    export NCCL_IB_HCA=mlx5
+    export NCCL_IB_TC=136
+    export NCCL_IB_SL=5
+    export NCCL_IB_GID_INDEX=3
+    export NCCL_IB_TIMEOUT=22
+    export NCCL_SOCKET_IFNAME=bond0
+    export NCCL_DEBUG=INFO
+    NODE_RANK=$RANK
+    if [ "$MASTER_ADDR" == "localhost" ] ; then $MASTER_ADDR=`hostname`; fi
+
+    echo $MASTER_ADDR
+    echo $MASTER_PORT
+    DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+fi
+
+if [ "$DEBUG" = "true" ]; then
+    EVAL_STEP=5
+    GLOBAL_BATCH_SIZE=8
+    GRAD_ACC=1
+
+fi
+
+DS_CONFIG=${DS_CONFIG:-train/ds_config/zero3_offload.json}
+
+LOG_ARGS="
+    --logging_steps 1 \
+    --logging_dir tensorboard/$FLAG \
+    --logging_strategy steps \
+    --logging_first_step True \
+    --report_to wandb \
+    --run_name $FLAG
+    "
+
+OUTPUT_ARGS="
+    --save_strategy steps \
+    --save_total_limit 500 \
+    --save_steps $SAVE_STEP \
+    --output_dir $OUTPUT_DIR \
+    --overwrite_output_dir
+    "
+
+TRAIN_ARGS="
+    --task_type $TASK_TYPE \
+    --do_train \
+    --max_seq_length $MAX_LENGTH \
+    --max_steps $MAX_STEP \
+    --lr_scheduler_type constant_with_warmup \
+    --learning_rate $LR \
+    --weight_decay 0.1 \
+    --warmup_steps 20 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.95 \
+    --gradient_accumulation_steps $GRAD_ACC \
+    --per_device_train_batch_size $MICRO_BATCH_SIZE
+    "
+
+EVAL_ARGS="
+    --do_eval \
+    --evaluation_strategy steps \
+    --eval_steps $EVAL_STEP \
+    --per_device_eval_batch_size 1
+    "
+
+INPUT_ARGS="
+    --model_name_or_path $LOAD_MODEL_PATH \
+    --tokenizer_name_or_path $LOAD_MODEL_PATH \
+    --sft_dataset_dir $SFT_DATA_DIR \
+    --data_cache_dir $DATA_CACHE_DIR
+    "
+
+EXTRA_ARGS="
+    --seed 1234 \
+    --deepspeed $DS_CONFIG \
+    --gradient_checkpointing \
+    --ddp_find_unused_parameters False \
+    --preprocessing_num_workers 12 \
+    --ddp_timeout 30000 \
+    --torch_dtype bfloat16 \
+    --bf16 \
+    --load_in_kbits 16
+    "
+
+mkdir -p logs/$FLAG || True
+torchrun $DISTRIBUTED_ARGS train/train.py \
+    $LOG_ARGS \
+    $OUTPUT_ARGS \
+    $TRAIN_ARGS \
+    $EVAL_ARGS \
+    $INPUT_ARGS \
+    $EXTRA_ARGS 2>&1 | tee -a logs/$FLAG/$RANK.log
+
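
One detail of the script worth spelling out is the batch arithmetic: GRAD_ACC is derived from the global batch by integer division, so with the defaults above (8 GPUs, 1 node, micro batch 1) it comes out to 4, matching the "# 8 * 4" comment on GLOBAL_BATCH_SIZE. A quick check of the same computation the shell $(( )) expansion performs, using only the script's default numbers:

# Mirrors: GRAD_ACC=$(( GLOBAL_BATCH_SIZE / (GPUS_PER_NODE * NNODES * MICRO_BATCH_SIZE) ))
gpus_per_node, nnodes, micro_batch, global_batch = 8, 1, 1, 32

grad_acc = global_batch // (gpus_per_node * nnodes * micro_batch)
assert grad_acc * gpus_per_node * nnodes * micro_batch == global_batch  # division is exact here
print(grad_acc)  # 4 -> 8 GPUs x micro batch 1 x 4 accumulation steps = 32 samples per optimizer step

Like the shell arithmetic, this silently drops any remainder, so on node counts that do not divide the global batch evenly the effective batch size shrinks.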