matlok committed
Commit
2349d28
1 Parent(s): 9c8c9b0

fix for including the tokenizer and adding documentation/comments

Files changed (2)
  1. README.md +399 -291
  2. run-tiny-merge.py +134 -59
README.md CHANGED
@@ -24,27 +24,51 @@ TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
 
 Please refer to the Unsloth fine-tuning guide for:
 
 - [Alpaca + TinyLlama + RoPE Scaling full example.ipynb](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
 
 ## How do I generate my own model merges?
 
 ```python3
 #!/usr/bin/env python3
 
 import transformers
 import torch
 import logging
 from ddare.merge import merge_tensors
- from ddare.tensor import dare_ties_sparsification, relative_norm, divide_tensor_into_sets
 from ddare.util import get_device
 import re
 from typing import Dict, Tuple, List
 
- # If you want to fine-tune, here's an example Unsloth fine tuning guide for:
- # Alpaca + TinyLlama + RoPE Scaling full example.ipynb
- # https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing
- 
- # code here was refactored from gist: https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
 
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger(__name__)
@@ -54,10 +78,16 @@ def get_models(
     models: List[str],
     trust_remote_code: bool,
 ):
     config = {
-         'torch_dtype': torch.float16,
-         'low_cpu_mem_usage': False,
-         'trust_remote_code': trust_remote_code,
     }
     loaded_models = []
     num_models = len(models)
@@ -68,8 +98,7 @@ def get_models(
         )
         loaded_models.append(
             transformers.AutoModelForCausalLM.from_pretrained(
-                 model_path,
-                 **config
             )
         )
     return loaded_models
@@ -78,6 +107,11 @@ def get_models(
 def pm(
     model,
 ):
     keys = model.state_dict().keys()
     log.info(f"model keys={len(keys)}")
     for i, k in enumerate(keys):
@@ -85,52 +119,66 @@ pm
         log.info(
             f"{i:3d} {k} shape={tensor.shape} "
             f"type={tensor.dtype} dev={tensor.device} "
-             f"contig={tensor.is_contiguous()}")
 
 
 def run_text_test(
     model,
-     tokenizer_path,
     question: str,
     device: str = "cuda",
 ):
     base_model = model.to(device)
-     log.info(
-         f"loading tokenizer={tokenizer_path}"
-     )
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         tokenizer_path,
         torch_dtype=torch.float16,
     )
 
-     inputs = tokenizer(
-         question,
-         return_tensors="pt"
-     ).to(device)
     with torch.backends.cuda.sdp_kernel(
         enable_flash=True,
         enable_math=False,
-         enable_mem_efficient=False
     ):
         outputs = base_model.generate(
             **inputs,
-             max_new_tokens=1000,
         )
-     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
     log.info(
         "\n"
         "----------"
         f"tokenizer={tokenizer}\n "
         f"question:\n{question}\n"
         f"answer:\n{answer}\n"
         "----------"
     )
     base_model = base_model.to(device)
 
 
- def get_layer_type(
-     key: str
- ) -> Tuple[int, str]:
     matcher = re.compile(r"model.layers.(\d+).(.+)")
     m = matcher.match(key)
     if m is None:
@@ -148,8 +196,16 @@ get_layer_type
 def merge_model_with_ties(
     models: List[str],
     model_dst: str,
-     trust_remote_code: bool = True
 ):
     models = get_models(
         models=models,
         trust_remote_code=trust_remote_code,
@@ -175,25 +231,30 @@ def merge_model_with_ties(
 
         # build a ratio
         ratio = {
-             'to_q': 0.0,
-             'to_k': 0.0,
-             'to_v': 0.0,
-         }.get(layer_type, .5)
 
         norm_ratio = 0.68
         log.info(
             f"model={k} {num_keys} shape={m0.shape} "
             f"dtype={m0.dtype} {m0.device} "
-             f"raio={ratio} "
             f"contig={m0.is_contiguous()} "
-             f"norm={norm_ratio}")
 
         # for all tensors
         for i, tensor in enumerate(m):
             if layer_type == "to_k":
                 # Get to_q key
-                 q_base = models[0].state_dict()[k.replace("to_k", "to_q")]
-                 q_merge = models[i].state_dict()[k.replace("to_k", "to_q")]
                 scale = relative_norm(q_merge, q_base)
                 tensor = tensor.to(device) / scale
                 del scale
@@ -201,9 +262,7 @@ def merge_model_with_ties(
             scale = relative_norm(tensor, m0)
             tensor = tensor.to(device) * scale
             del scale
-             slice_mask = (
-                 sets == i
-             ).bool()
             new_tensor = dare_ties_sparsification(
                 model_a_param=m0,
                 model_b_param=tensor,
@@ -211,21 +270,23 @@ def merge_model_with_ties(
                 ties="sum",
                 rescale="off",
                 device=device,
-                 **config)
-             new_tensor = merge_tensors("slerp", m0, tensor, ratio)
-             result = torch.where(slice_mask, new_tensor, result)
             del new_tensor, slice_mask
 
         result_dict[k] = result
     # end of merge
 
-     log.info(
-         f"done merge saving to file: {model_dst}"
-     )
     out_model = (
         transformers.AutoModelForCausalLM.from_pretrained(
-             model_dst,
-             **config
         )
     )
     out_model.state_dict = lambda: result_dict
@@ -233,17 +294,24 @@ def merge_model_with_ties(
 
 
 def run():
-     question = (
-         "why is the sky blue?"
     )
-     log.info(f"merging models and asking the question: {question}")
     model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
     model_dst = "matlok/tinyllama-cinder-openhermes-32k"
     device = "cuda"
     config = {
-         'torch_dtype': torch.float16,
-         'low_cpu_mem_usage': False,
-         'trust_remote_code': True,
     }
     models = [
         model_src,
@@ -253,13 +321,13 @@ def run():
         "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
     ]
     merge_model_with_ties(
-         models=models,
-         model_dst=model_dst
     )
     log.info(f"loading newly-created file: {model_dst}")
-     model = transformers.AutoModelForCausalLM.from_pretrained(
-         model_dst,
-         **config
     )
     log.info(
         f"loaded new model file: {model_dst} "
@@ -271,7 +339,29 @@ def run():
         question=question,
         device=device,
     )
-     log.info(f"done loading new model: {model} file: {model_dst}")
 
 
 if __name__ == "__main__":
@@ -283,223 +373,244 @@ if __name__ == "__main__":
 Here's the logs from the code above:
 
 ```
 Total VRAM 12282 MB, total RAM 85434 MB
 Set vram state to: NORMAL_VRAM
 Device: cuda:0 NVIDIA GeForce RTX 4070 Ti : native
 VAE dtype: torch.bfloat16
 INFO:__main__:merging models and asking the question: why is the sky blue?
- INFO:__main__:loading model=1/5 model=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
- INFO:__main__:loading model=2/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct
- /d/venvs/dev/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
- return self.fget.__get__(instance, owner)()
- INFO:__main__:loading model=3/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k
- INFO:__main__:loading model=4/5 model=Tensoic/TinyLlama-1.1B-3T-openhermes
- INFO:__main__:loading model=5/5 model=Josephgflowers/TinyLlama-3T-Cinder-v1.3
- INFO:__main__:model=model.embed_tokens.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.0.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.0.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.0.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.0.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.0.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.0.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.0.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.0.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.0.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.1.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.1.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.1.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.1.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.1.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.1.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.1.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.1.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.1.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.2.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.2.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.2.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.2.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.2.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.2.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.2.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.2.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.2.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.3.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.3.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.3.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.3.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.3.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.3.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.3.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.3.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.3.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.4.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.4.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.4.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.4.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.4.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.4.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.4.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.4.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.4.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.5.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.5.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.5.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.5.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.5.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.5.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.5.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.5.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.5.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.6.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.6.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.6.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.6.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.6.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.6.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.6.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.6.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.6.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.7.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.7.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.7.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.7.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.7.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.7.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.7.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.7.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.7.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.8.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.8.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.8.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.8.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.8.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.8.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.8.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.8.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.8.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.9.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.9.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.9.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.9.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.9.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.9.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.9.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.9.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.9.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.10.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.10.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.10.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.10.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.10.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.10.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.10.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.10.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.10.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.11.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.11.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.11.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.11.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.11.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.11.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.11.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.11.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.11.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.12.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.12.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.12.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.12.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.12.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.12.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.12.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.12.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.12.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.13.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.13.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.13.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.13.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.13.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.13.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.13.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.13.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.13.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.14.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.14.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.14.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.14.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.14.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.14.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.14.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.14.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.14.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.15.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.15.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.15.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.15.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.15.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.15.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.15.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.15.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.15.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.16.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.16.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.16.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.16.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.16.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.16.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.16.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.16.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.16.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.17.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.17.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.17.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.17.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.17.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.17.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.17.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.17.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.17.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.18.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.18.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.18.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.18.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.18.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.18.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.18.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.18.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.18.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.19.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.19.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.19.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.19.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.19.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.19.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.19.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.19.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.19.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.20.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.20.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.20.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.20.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.20.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.20.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.20.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.20.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.20.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.21.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.21.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.21.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.21.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.21.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.21.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.21.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.21.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.layers.21.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=model.norm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
- INFO:__main__:model=lm_head.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
  INFO:__main__:done merge saving to file: matlok/tinyllama-cinder-openhermes-32k
  INFO:__main__:loading newly-created file: matlok/tinyllama-cinder-openhermes-32k
- INFO:__main__:loaded new model file: matlok/tinyllama-cinder-openhermes-32k asking question: why is the sky blue?
  INFO:__main__:loading tokenizer=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  INFO:__main__:
  ----------
@@ -512,27 +623,20 @@ tokenizer=LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-intermediate
  why is the sky blue?
  answer:
  why is the sky blue?
- The sky is blue because it is made up of the colors of the visible spectrum. The visible spectrum is a range of colors that can be seen with the naked eye. The colors in the visible spectrum are made up of light waves that are shorter than the wavelengths of the visible light. The shorter wavelengths of light are absorbed more easily by the atmosphere, which is why the sky is blue.
- What is the color of the sky?
- The color of the sky is blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
- What is the color of the sky in the winter?
- The color of the sky in the winter is usually a deep blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
- What is the color of the sky in the summer?
- The color of the sky in the summer is usually a bright yellow. This is because the visible spectrum is made up of the colors of the yellow and orange parts of the spectrum. The yellow part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
- What is the color of the sky in the spring?
- The color of the sky in the spring is usually a bright green. This is because the visible spectrum is made up of the colors of the green and blue parts of the spectrum. The green part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The blue part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
- What is the color of the sky in the fall?
- The color of the sky in the fall is usually a deep red. This is because the visible spectrum is made up of the colors of the red and orange parts of the spectrum. The red part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
- What is the color of the sky in the winter?
- The color of the sky in the winter is usually a deep blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
- What is the color of the sky in the summer?
- The color of the sky in the summer is usually a bright yellow. This is because the visible spectrum is made up of the colors of the yellow and orange parts of the spectrum. The yellow part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
- What is the color of the sky in the spring?
- The color of the sky in the spring is usually a bright green. This is because the visible spectrum is made up of the colors of the green and blue parts of the spectrum. The green part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The blue part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
- What is the color of the sky in the fall?
- The color of the sky in the fall is usually a deep red. This is because the visible spectrum is made up of the colors of the red and orange parts of the spectrum. The red part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
- What is the color of the
  ----------
  INFO:__main__:done loading new model: LlamaForCausalLM(
  (model): LlamaModel(
  (embed_tokens): Embedding(32000, 2048)
@@ -560,9 +664,13 @@ INFO:__main__:done loading new model: LlamaForCausalLM(
  (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
  ) file: matlok/tinyllama-cinder-openhermes-32k
 
- real 0m49.612s
- user 3m2.617s
- sys 0m14.655s
  ```
 
- Note: code sample above was modified from [this very helpful GitHub gist](https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b)

 
 Please refer to the Unsloth fine-tuning guide for:
 
+ ### Fine-tuning using HuggingFace SFTTrainer
+ 
+ - [Fine-tuning using HuggingFace SFTTrainer](https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing)
+ 
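For orientation, a minimal SFTTrainer run looks roughly like the sketch below (not taken from the notebook above, and untested here; it follows the older `trl` quickstart API, where `dataset_text_field`/`max_seq_length` were trainer arguments, and the `imdb` dataset is only a placeholder):

```python3
from datasets import load_dataset
from trl import SFTTrainer

# any text dataset works; imdb is only a placeholder for illustration
dataset = load_dataset("imdb", split="train")

trainer = SFTTrainer(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
)
trainer.train()
```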
+ ### Fine-tuning using Unsloth
+ 
+ 2024-02-07 - unable to use Unsloth due to pip install issues. Maybe others in the future will have more luck:
+ 
 - [Alpaca + TinyLlama + RoPE Scaling full example.ipynb](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
 
 ## How do I generate my own model merges?
 
+ This requires having the HuggingFace token set before it will work.
+ 
+ If you're using the command line, you can use:
+ 
+ ```sh
+ huggingface-cli login
+ ```
+ 
+ ```sh
+ time ./run-tiny-merge.py
+ ```
+ 
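If you'd rather not use the interactive prompt, the same login can be done from Python (a minimal sketch; `login` is part of the `huggingface_hub` package, and reading the token from an `HF_TOKEN` environment variable is just one convention for keeping it out of the script):

```python3
import os
from huggingface_hub import login

# assumes the token was exported first, e.g. `export HF_TOKEN=hf_...`
login(token=os.environ["HF_TOKEN"])
```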
+ ### What's this code doing?
+ 
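In short: the script below downloads five TinyLlama variants, merges their weights tensor-by-tensor with DARE-TIES sparsification plus a slerp blend, and uploads the merged model and tokenizer to the HuggingFace Hub.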
+ Here's the latest version:
+ 
 ```python3
 #!/usr/bin/env python3
 
+ import os
 import transformers
 import torch
 import logging
 from ddare.merge import merge_tensors
+ from ddare.tensor import (
+     dare_ties_sparsification,
+     relative_norm,
+     divide_tensor_into_sets,
+ )
 from ddare.util import get_device
 import re
 from typing import Dict, Tuple, List
 
 
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger(__name__)
 
     models: List[str],
     trust_remote_code: bool,
 ):
+     """
+     get the models
+ 
+     :param models: model names to download
+     :param trust_remote_code: are you sure??? True/False
+     """
     config = {
+         "torch_dtype": torch.float16,
+         "low_cpu_mem_usage": False,
+         "trust_remote_code": trust_remote_code,
     }
     loaded_models = []
     num_models = len(models)
 
         )
         loaded_models.append(
             transformers.AutoModelForCausalLM.from_pretrained(
+                 model_path, **config
             )
         )
     return loaded_models
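# example call (mirrors how run() uses it further below):
# loaded = get_models(
#     models=["TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"],
#     trust_remote_code=True,
# )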
 
 def pm(
     model,
 ):
+     """
+     pretty print model
+ 
+     :param model: show me the model
+     """
     keys = model.state_dict().keys()
     log.info(f"model keys={len(keys)}")
     for i, k in enumerate(keys):
 
         log.info(
             f"{i:3d} {k} shape={tensor.shape} "
             f"type={tensor.dtype} dev={tensor.device} "
+             f"contig={tensor.is_contiguous()}"
+         )
 
 
 def run_text_test(
     model,
+     tokenizer_path: str,
     question: str,
     device: str = "cuda",
 ):
+     """
+     run a question on the model and log the answer
+ 
+     :param model: initialized model
+     :param tokenizer_path: tokenizer path/name
+     :param question: what are you asking?
+     :param device: where do you want to run "cpu"/"cuda"?
+     """
     base_model = model.to(device)
+     log.info(f"loading tokenizer={tokenizer_path}")
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         tokenizer_path,
         torch_dtype=torch.float16,
     )
 
+     inputs = tokenizer(question, return_tensors="pt").to(
+         device
+     )
     with torch.backends.cuda.sdp_kernel(
         enable_flash=True,
         enable_math=False,
+         enable_mem_efficient=True,
     ):
         outputs = base_model.generate(
             **inputs,
+             max_new_tokens=256,
         )
+     answer = tokenizer.decode(
+         outputs[0], skip_special_tokens=True
+     )
     log.info(
         "\n"
         "----------"
+         "\n"
         f"tokenizer={tokenizer}\n "
         f"question:\n{question}\n"
         f"answer:\n{answer}\n"
         "----------"
     )
     base_model = base_model.to(device)
+     return tokenizer
+ 
 
+ def get_layer_type(key: str) -> Tuple[int, str]:
+     """
+     get the layer type
+ 
+     :param key: name of the layer
+     :return: layer id and name
+     """
     matcher = re.compile(r"model.layers.(\d+).(.+)")
     m = matcher.match(key)
     if m is None:
 
 def merge_model_with_ties(
     models: List[str],
     model_dst: str,
+     trust_remote_code: bool = True,
 ):
+     """
+     merge the list of models into one model
+     called model_dst
+ 
+     :param models: list of models to merge
+     :param model_dst: name of the new model
+     :param trust_remote_code: are you sure? True/False
+     """
     models = get_models(
         models=models,
         trust_remote_code=trust_remote_code,
 
 
         # build a ratio
         ratio = {
+             "to_q": 0.0,
+             "to_k": 0.0,
+             "to_v": 0.0,
+         }.get(layer_type, 0.5)
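        # note: "to_q"/"to_k"/"to_v" look like diffusers-style attention
        # names; TinyLlama's keys use q_proj/k_proj/v_proj, which is why the
        # logs below report ratio=0.5 (the default) for every tensor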
 
         norm_ratio = 0.68
         log.info(
             f"model={k} {num_keys} shape={m0.shape} "
             f"dtype={m0.dtype} {m0.device} "
+             f"ratio={ratio} "
             f"contig={m0.is_contiguous()} "
+             f"norm={norm_ratio}"
+         )
 
         # for all tensors
         for i, tensor in enumerate(m):
             if layer_type == "to_k":
                 # Get to_q key
+                 q_base = models[0].state_dict()[
+                     k.replace("to_k", "to_q")
+                 ]
+                 q_merge = models[i].state_dict()[
+                     k.replace("to_k", "to_q")
+                 ]
                 scale = relative_norm(q_merge, q_base)
                 tensor = tensor.to(device) / scale
                 del scale
 
             scale = relative_norm(tensor, m0)
             tensor = tensor.to(device) * scale
             del scale
+             slice_mask = (sets == i).bool()
 
             new_tensor = dare_ties_sparsification(
                 model_a_param=m0,
                 model_b_param=tensor,
 
                 ties="sum",
                 rescale="off",
                 device=device,
+                 **config,
+             )
+             new_tensor = merge_tensors(
+                 "slerp", m0, tensor, ratio
+             )
+             result = torch.where(
+                 slice_mask, new_tensor, result
+             )
             del new_tensor, slice_mask
 
         result_dict[k] = result
     # end of merge
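    # net effect: each source model i only contributes the elements where
    # sets == i, so every merged tensor is a patchwork of DARE-TIES
    # sparsified, slerp-blended slices from the five models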
 
+     log.info(f"done merge saving to file: {model_dst}")
     out_model = (
         transformers.AutoModelForCausalLM.from_pretrained(
+             model_dst, **config
         )
     )
     out_model.state_dict = lambda: result_dict
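    # overriding state_dict() this way makes the later push_to_hub() upload
    # the merged tensors instead of the freshly downloaded base weights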
 
 
 
 def run():
+     """
+     run the merge and upload the model and tokenizer
+ 
+     This requires having the HuggingFace token
+     set before it will work:
+     ```huggingface-cli login```
+     """
+     question = "why is the sky blue?"
+     log.info(
+         f"merging models and asking the question: {question}"
     )
     model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
     model_dst = "matlok/tinyllama-cinder-openhermes-32k"
     device = "cuda"
     config = {
+         "torch_dtype": torch.float16,
+         "low_cpu_mem_usage": False,
+         "trust_remote_code": True,
     }
     models = [
         model_src,
 
         "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
     ]
     merge_model_with_ties(
+         models=models, model_dst=model_dst
     )
     log.info(f"loading newly-created file: {model_dst}")
+     model = (
+         transformers.AutoModelForCausalLM.from_pretrained(
+             model_dst, **config
+         )
     )
     log.info(
         f"loaded new model file: {model_dst} "
 
         question=question,
         device=device,
     )
+ 
+     # clean the temp merge dir
+     # remove model dir to prevent issues with the tokenizer upload
+     model_org = model_dst.split("/")[0]
+     if os.path.exists(model_org):
+         os.system(f"rm -rf ./{model_org}")
+ 
+     log.info(f"uploading model: {model_dst}")
+     model.push_to_hub(model_dst)
+ 
+     log.info(f"uploading src tokenizer: {model_src}")
+     # reload the tokenizer to save it, found at:
+     # https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing#scrollTo=QQn30cRtAZ-P
+     tokenizer = transformers.AutoTokenizer.from_pretrained(
+         model_src, trust_remote_code=True
+     )
+     # https://huggingface.co/docs/transformers/model_sharing#use-the-pushtohub-function
+     # tokenizer.push_to_hub("my-awesome-model")
+     tokenizer.push_to_hub(model_dst)
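    # push_to_hub uses the token cached by `huggingface-cli login` and
    # creates the destination repo if it does not exist yet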
+     log.info(
+         f"done loading new model: {model} "
+         f"file: {model_dst}"
+     )
 
 
 if __name__ == "__main__":
 
  Here's the logs from the code above:
 
  ```
+ time ./run-tiny-merge.py
  Total VRAM 12282 MB, total RAM 85434 MB
  Set vram state to: NORMAL_VRAM
  Device: cuda:0 NVIDIA GeForce RTX 4070 Ti : native
  VAE dtype: torch.bfloat16
  INFO:__main__:merging models and asking the question: why is the sky blue?
+ INFO:__main__:loading model=1/5 model=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+ config.json: 100%|█████████████████████████████████████| 560/560 [00:00<00:00, 5.23MB/s]
+ model.safetensors: 100%|███████████████████████████| 4.40G/4.40G [00:48<00:00, 90.2MB/s]
+ generation_config.json: 100%|███████████████████████████| 129/129 [00:00<00:00, 721kB/s]
+ INFO:__main__:loading model=2/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct
+ config.json: 100%|█████████████████████████████████████| 695/695 [00:00<00:00, 3.04MB/s]
+ pytorch_model.bin: 100%|███████████████████████████| 2.20G/2.20G [00:23<00:00, 92.6MB/s]
+ generation_config.json: 100%|███████████████████████████| 129/129 [00:00<00:00, 566kB/s]
+ INFO:__main__:loading model=3/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k
+ config.json: 100%|█████████████████████████████████████| 686/686 [00:00<00:00, 3.57MB/s]
+ model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [00:24<00:00, 90.5MB/s]
+ generation_config.json: 100%|██████████████████████████| 124/124 [00:00<00:00, 1.80MB/s]
+ INFO:__main__:loading model=4/5 model=Tensoic/TinyLlama-1.1B-3T-openhermes
+ config.json: 100%|█████████████████████████████████████| 702/702 [00:00<00:00, 2.97MB/s]
+ pytorch_model.bin: 100%|███████████████████████████| 2.20G/2.20G [00:23<00:00, 92.7MB/s]
+ generation_config.json: 100%|███████████████████████████| 124/124 [00:00<00:00, 671kB/s]
+ INFO:__main__:loading model=5/5 model=Josephgflowers/TinyLlama-3T-Cinder-v1.3
+ config.json: 100%|█████████████████████████████████████| 713/713 [00:00<00:00, 9.35MB/s]
+ model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [00:24<00:00, 91.5MB/s]
+ generation_config.json: 100%|██████████████████████████| 138/138 [00:00<00:00, 1.86MB/s]
+ INFO:__main__:model=model.embed_tokens.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.0.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.0.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.0.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.0.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.0.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.0.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.0.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.0.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.0.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.1.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.1.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.1.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.1.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
+ INFO:__main__:model=model.layers.1.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
417
+ INFO:__main__:model=model.layers.1.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
418
+ INFO:__main__:model=model.layers.1.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
419
+ INFO:__main__:model=model.layers.1.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
420
+ INFO:__main__:model=model.layers.1.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
421
+ INFO:__main__:model=model.layers.2.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
422
+ INFO:__main__:model=model.layers.2.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
423
+ INFO:__main__:model=model.layers.2.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
424
+ INFO:__main__:model=model.layers.2.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
425
+ INFO:__main__:model=model.layers.2.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
426
+ INFO:__main__:model=model.layers.2.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
427
+ INFO:__main__:model=model.layers.2.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
428
+ INFO:__main__:model=model.layers.2.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
429
+ INFO:__main__:model=model.layers.2.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
430
+ INFO:__main__:model=model.layers.3.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
431
+ INFO:__main__:model=model.layers.3.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
432
+ INFO:__main__:model=model.layers.3.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
433
+ INFO:__main__:model=model.layers.3.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
434
+ INFO:__main__:model=model.layers.3.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
435
+ INFO:__main__:model=model.layers.3.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
436
+ INFO:__main__:model=model.layers.3.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
437
+ INFO:__main__:model=model.layers.3.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
438
+ INFO:__main__:model=model.layers.3.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
439
+ INFO:__main__:model=model.layers.4.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
440
+ INFO:__main__:model=model.layers.4.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
441
+ INFO:__main__:model=model.layers.4.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
442
+ INFO:__main__:model=model.layers.4.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
443
+ INFO:__main__:model=model.layers.4.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
444
+ INFO:__main__:model=model.layers.4.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
445
+ INFO:__main__:model=model.layers.4.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
446
+ INFO:__main__:model=model.layers.4.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
447
+ INFO:__main__:model=model.layers.4.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
448
+ INFO:__main__:model=model.layers.5.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
449
+ INFO:__main__:model=model.layers.5.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
450
+ INFO:__main__:model=model.layers.5.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
451
+ INFO:__main__:model=model.layers.5.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
452
+ INFO:__main__:model=model.layers.5.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
453
+ INFO:__main__:model=model.layers.5.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
454
+ INFO:__main__:model=model.layers.5.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
455
+ INFO:__main__:model=model.layers.5.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
456
+ INFO:__main__:model=model.layers.5.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
457
+ INFO:__main__:model=model.layers.6.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
458
+ INFO:__main__:model=model.layers.6.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
459
+ INFO:__main__:model=model.layers.6.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
460
+ INFO:__main__:model=model.layers.6.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
461
+ INFO:__main__:model=model.layers.6.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
462
+ INFO:__main__:model=model.layers.6.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
463
+ INFO:__main__:model=model.layers.6.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
464
+ INFO:__main__:model=model.layers.6.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
465
+ INFO:__main__:model=model.layers.6.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
466
+ INFO:__main__:model=model.layers.7.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
467
+ INFO:__main__:model=model.layers.7.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
468
+ INFO:__main__:model=model.layers.7.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
469
+ INFO:__main__:model=model.layers.7.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
470
+ INFO:__main__:model=model.layers.7.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
471
+ INFO:__main__:model=model.layers.7.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
472
+ INFO:__main__:model=model.layers.7.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
473
+ INFO:__main__:model=model.layers.7.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
474
+ INFO:__main__:model=model.layers.7.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
475
+ INFO:__main__:model=model.layers.8.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
476
+ INFO:__main__:model=model.layers.8.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
477
+ INFO:__main__:model=model.layers.8.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
478
+ INFO:__main__:model=model.layers.8.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
479
+ INFO:__main__:model=model.layers.8.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
480
+ INFO:__main__:model=model.layers.8.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
481
+ INFO:__main__:model=model.layers.8.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
482
+ INFO:__main__:model=model.layers.8.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
483
+ INFO:__main__:model=model.layers.8.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
484
+ INFO:__main__:model=model.layers.9.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
485
+ INFO:__main__:model=model.layers.9.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
486
+ INFO:__main__:model=model.layers.9.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
487
+ INFO:__main__:model=model.layers.9.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
488
+ INFO:__main__:model=model.layers.9.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
489
+ INFO:__main__:model=model.layers.9.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
490
+ INFO:__main__:model=model.layers.9.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
491
+ INFO:__main__:model=model.layers.9.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
492
+ INFO:__main__:model=model.layers.9.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
493
+ INFO:__main__:model=model.layers.10.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
494
+ INFO:__main__:model=model.layers.10.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
495
+ INFO:__main__:model=model.layers.10.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
496
+ INFO:__main__:model=model.layers.10.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
497
+ INFO:__main__:model=model.layers.10.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
498
+ INFO:__main__:model=model.layers.10.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
499
+ INFO:__main__:model=model.layers.10.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
500
+ INFO:__main__:model=model.layers.10.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
501
+ INFO:__main__:model=model.layers.10.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
502
+ INFO:__main__:model=model.layers.11.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
503
+ INFO:__main__:model=model.layers.11.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
504
+ INFO:__main__:model=model.layers.11.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
505
+ INFO:__main__:model=model.layers.11.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
506
+ INFO:__main__:model=model.layers.11.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
507
+ INFO:__main__:model=model.layers.11.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
508
+ INFO:__main__:model=model.layers.11.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
509
+ INFO:__main__:model=model.layers.11.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
510
+ INFO:__main__:model=model.layers.11.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
511
+ INFO:__main__:model=model.layers.12.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
512
+ INFO:__main__:model=model.layers.12.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
513
+ INFO:__main__:model=model.layers.12.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
514
+ INFO:__main__:model=model.layers.12.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
515
+ INFO:__main__:model=model.layers.12.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
516
+ INFO:__main__:model=model.layers.12.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
517
+ INFO:__main__:model=model.layers.12.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
518
+ INFO:__main__:model=model.layers.12.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
519
+ INFO:__main__:model=model.layers.12.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
520
+ INFO:__main__:model=model.layers.13.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
521
+ INFO:__main__:model=model.layers.13.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
522
+ INFO:__main__:model=model.layers.13.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
523
+ INFO:__main__:model=model.layers.13.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
524
+ INFO:__main__:model=model.layers.13.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
525
+ INFO:__main__:model=model.layers.13.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
526
+ INFO:__main__:model=model.layers.13.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
527
+ INFO:__main__:model=model.layers.13.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
528
+ INFO:__main__:model=model.layers.13.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
529
+ INFO:__main__:model=model.layers.14.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
530
+ INFO:__main__:model=model.layers.14.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
531
+ INFO:__main__:model=model.layers.14.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
532
+ INFO:__main__:model=model.layers.14.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
533
+ INFO:__main__:model=model.layers.14.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
534
+ INFO:__main__:model=model.layers.14.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
535
+ INFO:__main__:model=model.layers.14.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
536
+ INFO:__main__:model=model.layers.14.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
537
+ INFO:__main__:model=model.layers.14.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
538
+ INFO:__main__:model=model.layers.15.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
539
+ INFO:__main__:model=model.layers.15.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
540
+ INFO:__main__:model=model.layers.15.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
541
+ INFO:__main__:model=model.layers.15.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
542
+ INFO:__main__:model=model.layers.15.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
543
+ INFO:__main__:model=model.layers.15.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
544
+ INFO:__main__:model=model.layers.15.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
545
+ INFO:__main__:model=model.layers.15.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
546
+ INFO:__main__:model=model.layers.15.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
547
+ INFO:__main__:model=model.layers.16.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
548
+ INFO:__main__:model=model.layers.16.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
549
+ INFO:__main__:model=model.layers.16.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
550
+ INFO:__main__:model=model.layers.16.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
551
+ INFO:__main__:model=model.layers.16.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
552
+ INFO:__main__:model=model.layers.16.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
553
+ INFO:__main__:model=model.layers.16.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
554
+ INFO:__main__:model=model.layers.16.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
555
+ INFO:__main__:model=model.layers.16.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
556
+ INFO:__main__:model=model.layers.17.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
557
+ INFO:__main__:model=model.layers.17.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
558
+ INFO:__main__:model=model.layers.17.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
559
+ INFO:__main__:model=model.layers.17.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
560
+ INFO:__main__:model=model.layers.17.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
561
+ INFO:__main__:model=model.layers.17.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
562
+ INFO:__main__:model=model.layers.17.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
563
+ INFO:__main__:model=model.layers.17.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
564
+ INFO:__main__:model=model.layers.17.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
565
+ INFO:__main__:model=model.layers.18.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
566
+ INFO:__main__:model=model.layers.18.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
567
+ INFO:__main__:model=model.layers.18.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
568
+ INFO:__main__:model=model.layers.18.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
569
+ INFO:__main__:model=model.layers.18.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
570
+ INFO:__main__:model=model.layers.18.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
571
+ INFO:__main__:model=model.layers.18.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
572
+ INFO:__main__:model=model.layers.18.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
573
+ INFO:__main__:model=model.layers.18.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
574
+ INFO:__main__:model=model.layers.19.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
575
+ INFO:__main__:model=model.layers.19.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
576
+ INFO:__main__:model=model.layers.19.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
577
+ INFO:__main__:model=model.layers.19.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
578
+ INFO:__main__:model=model.layers.19.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
579
+ INFO:__main__:model=model.layers.19.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
580
+ INFO:__main__:model=model.layers.19.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
581
+ INFO:__main__:model=model.layers.19.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
582
+ INFO:__main__:model=model.layers.19.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
583
+ INFO:__main__:model=model.layers.20.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
584
+ INFO:__main__:model=model.layers.20.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
585
+ INFO:__main__:model=model.layers.20.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
586
+ INFO:__main__:model=model.layers.20.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
587
+ INFO:__main__:model=model.layers.20.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
588
+ INFO:__main__:model=model.layers.20.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
589
+ INFO:__main__:model=model.layers.20.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
590
+ INFO:__main__:model=model.layers.20.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
591
+ INFO:__main__:model=model.layers.20.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
592
+ INFO:__main__:model=model.layers.21.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
593
+ INFO:__main__:model=model.layers.21.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
594
+ INFO:__main__:model=model.layers.21.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
595
+ INFO:__main__:model=model.layers.21.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
596
+ INFO:__main__:model=model.layers.21.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
597
+ INFO:__main__:model=model.layers.21.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
598
+ INFO:__main__:model=model.layers.21.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
599
+ INFO:__main__:model=model.layers.21.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
600
+ INFO:__main__:model=model.layers.21.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
601
+ INFO:__main__:model=model.norm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
602
+ INFO:__main__:model=lm_head.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
603
  INFO:__main__:done merge saving to file: matlok/tinyllama-cinder-openhermes-32k
604
+ config.json: 100%|█████████████████████████████████████| 724/724 [00:00<00:00, 7.75MB/s]
605
+ model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [00:23<00:00, 91.8MB/s]
606
+ generation_config.json: 100%|██████████████████████████| 133/133 [00:00<00:00, 1.58MB/s]
607
  INFO:__main__:loading newly-created file: matlok/tinyllama-cinder-openhermes-32k
608
+ INFO:__main__:loaded new model file: matlok/tinyllama-cinder-openhermes-32k asking question: why is the sky blue?
609
  INFO:__main__:loading tokenizer=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
610
+ tokenizer_config.json: 100%|███████████████████████████| 776/776 [00:00<00:00, 8.26MB/s]
611
+ tokenizer.model: 100%|███████████████████████████████| 500k/500k [00:00<00:00, 64.6MB/s]
612
+ tokenizer.json: 100%|██████████████████████████████| 1.84M/1.84M [00:01<00:00, 1.57MB/s]
613
+ special_tokens_map.json: 100%|█████████████████████████| 414/414 [00:00<00:00, 2.47MB/s]
614
  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
615
  INFO:__main__:
616
  ----------
 
623
  why is the sky blue?
624
  answer:
625
  why is the sky blue?
626
+ Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky.
627
+ Why is the sky blue?
628
+ Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky.
629
+ Why is the sky blue?
630
+ Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky.
631
+ Why is the sky blue?
632
+ Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky.
633
+ Why is the sky blue?
634
+ Answer: The sky is blue because of the presence of the trace amounts of
635
  ----------
636
+ INFO:__main__:uploading model: matlok/tinyllama-cinder-openhermes-32k
637
+ README.md: 100%|████████████████████████████████████| 45.6k/45.6k [00:00<00:00, 297MB/s]
638
+ model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [01:18<00:00, 28.0MB/s]
639
+ INFO:__main__:uploading src tokenizer: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
640
  INFO:__main__:done loading new model: LlamaForCausalLM(
641
  (model): LlamaModel(
642
  (embed_tokens): Embedding(32000, 2048)
 
664
  (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
665
  ) file: matlok/tinyllama-cinder-openhermes-32k
666
 
667
+ real 4m44.626s
668
+ user 2m54.434s
669
+ sys 0m25.981s
670
  ```
671
 
672
+ ### Acknowledgements
673
+
674
+ - The code sample above was adapted from [this very helpful GitHub gist](https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b)
675
+ - [Fine-tuning example](https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing)
676
+ - [CodeLlama example](https://huggingface.co/collections/mlabonne/codellama-6509bc68c2d4c8fc379ee87f)
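+
+ ### How do I load the merged model?
+
+ A minimal usage sketch (not part of the merge script above; it assumes only the stock `transformers` generate/decode API, a CUDA GPU, and the model id shown in the upload logs):
+
+ ```python3
+ import torch
+ import transformers
+
+ model_name = "matlok/tinyllama-cinder-openhermes-32k"
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+ model = transformers.AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype=torch.float16,
+ ).to("cuda")
+
+ # same smoke-test question the merge script asks
+ inputs = tokenizer("why is the sky blue?", return_tensors="pt").to("cuda")
+ outputs = model.generate(**inputs, max_new_tokens=256)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```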
run-tiny-merge.py CHANGED
@@ -1,5 +1,21 @@
1
  #!/usr/bin/env python3
2
 
3
  import transformers
4
  import torch
5
  import logging
@@ -7,18 +23,12 @@ from ddare.merge import merge_tensors
7
  from ddare.tensor import (
8
  dare_ties_sparsification,
9
  relative_norm,
10
- divide_tensor_into_sets
11
  )
12
  from ddare.util import get_device
13
  import re
14
  from typing import Dict, Tuple, List
15
 
16
- # If you want to fine-tune, here's an example Unsloth fine tuning guide for:
17
- # Alpaca + TinyLlama + RoPE Scaling full example.ipynb
18
- # https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing
19
-
20
- # code here was refactored from gist:
21
- # https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
22
 
23
  logging.basicConfig(level=logging.INFO)
24
  log = logging.getLogger(__name__)
@@ -28,10 +38,16 @@ def get_models(
28
  models: List[str],
29
  trust_remote_code: bool,
30
  ):
31
  config = {
32
- 'torch_dtype': torch.float16,
33
- 'low_cpu_mem_usage': False,
34
- 'trust_remote_code': trust_remote_code,
35
  }
36
  loaded_models = []
37
  num_models = len(models)
@@ -42,8 +58,7 @@ def get_models(
42
  )
43
  loaded_models.append(
44
  transformers.AutoModelForCausalLM.from_pretrained(
45
- model_path,
46
- **config
47
  )
48
  )
49
  return loaded_models
@@ -52,6 +67,11 @@ def get_models(
52
  def pm(
53
  model,
54
  ):
55
  keys = model.state_dict().keys()
56
  log.info(f"model keys={len(keys)}")
57
  for i, k in enumerate(keys):
@@ -59,38 +79,46 @@ def pm(
59
  log.info(
60
  f"{i:3d} {k} shape={tensor.shape} "
61
  f"type={tensor.dtype} dev={tensor.device} "
62
- f"contig={tensor.is_contiguous()}")
 
63
 
64
 
65
  def run_text_test(
66
  model,
67
- tokenizer_path,
68
  question: str,
69
  device: str = "cuda",
70
  ):
71
  base_model = model.to(device)
72
- log.info(
73
- f"loading tokenizer={tokenizer_path}"
74
- )
75
  tokenizer = transformers.AutoTokenizer.from_pretrained(
76
  tokenizer_path,
77
  torch_dtype=torch.float16,
78
  )
79
 
80
- inputs = tokenizer(
81
- question,
82
- return_tensors="pt"
83
- ).to(device)
84
  with torch.backends.cuda.sdp_kernel(
85
  enable_flash=True,
86
  enable_math=False,
87
- enable_mem_efficient=False
88
  ):
89
  outputs = base_model.generate(
90
  **inputs,
91
- max_new_tokens=1000,
92
  )
93
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
94
  log.info(
95
  "\n"
96
  "----------"
@@ -101,11 +129,16 @@ def run_text_test(
101
  "----------"
102
  )
103
  base_model = base_model.to(device)
 
104
 
105
 
106
- def get_layer_type(
107
- key: str
108
- ) -> Tuple[int, str]:
 
 
 
 
109
  matcher = re.compile(r"model.layers.(\d+).(.+)")
110
  m = matcher.match(key)
111
  if m is None:
@@ -123,8 +156,16 @@ def get_layer_type(
123
  def merge_model_with_ties(
124
  models: List[str],
125
  model_dst: str,
126
- trust_remote_code: bool = True
127
  ):
128
  models = get_models(
129
  models=models,
130
  trust_remote_code=trust_remote_code,
@@ -150,25 +191,30 @@ def merge_model_with_ties(
150
 
151
  # build a ratio
152
  ratio = {
153
- 'to_q': 0.0,
154
- 'to_k': 0.0,
155
- 'to_v': 0.0,
156
- }.get(layer_type, .5)
157
 
158
  norm_ratio = 0.68
159
  log.info(
160
  f"model={k} {num_keys} shape={m0.shape} "
161
  f"dtype={m0.dtype} {m0.device} "
162
- f"raio={ratio} "
163
  f"contig={m0.is_contiguous()} "
164
- f"norm={norm_ratio}")
 
165
 
166
  # for all tensors
167
  for i, tensor in enumerate(m):
168
  if layer_type == "to_k":
169
  # Get to_q key
170
- q_base = models[0].state_dict()[k.replace("to_k", "to_q")]
171
- q_merge = models[i].state_dict()[k.replace("to_k", "to_q")]
 
 
 
 
172
  scale = relative_norm(q_merge, q_base)
173
  tensor = tensor.to(device) / scale
174
  del scale
@@ -176,9 +222,7 @@ def merge_model_with_ties(
176
  scale = relative_norm(tensor, m0)
177
  tensor = tensor.to(device) * scale
178
  del scale
179
- slice_mask = (
180
- sets == i
181
- ).bool()
182
  new_tensor = dare_ties_sparsification(
183
  model_a_param=m0,
184
  model_b_param=tensor,
@@ -186,21 +230,23 @@ def merge_model_with_ties(
186
  ties="sum",
187
  rescale="off",
188
  device=device,
189
- **config)
190
- new_tensor = merge_tensors("slerp", m0, tensor, ratio)
191
- result = torch.where(slice_mask, new_tensor, result)
 
 
 
 
 
192
  del new_tensor, slice_mask
193
 
194
  result_dict[k] = result
195
  # end of merge
196
 
197
- log.info(
198
- f"done merge saving to file: {model_dst}"
199
- )
200
  out_model = (
201
  transformers.AutoModelForCausalLM.from_pretrained(
202
- model_dst,
203
- **config
204
  )
205
  )
206
  out_model.state_dict = lambda: result_dict
@@ -208,17 +254,24 @@ def merge_model_with_ties(
208
 
209
 
210
  def run():
211
- question = (
212
- "why is the sky blue?"
 
 
 
 
 
 
 
 
213
  )
214
- log.info(f"merging models and asking the question: {question}")
215
  model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
216
  model_dst = "matlok/tinyllama-cinder-openhermes-32k"
217
  device = "cuda"
218
  config = {
219
- 'torch_dtype': torch.float16,
220
- 'low_cpu_mem_usage': False,
221
- 'trust_remote_code': True,
222
  }
223
  models = [
224
  model_src,
@@ -228,13 +281,13 @@ def run():
228
  "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
229
  ]
230
  merge_model_with_ties(
231
- models=models,
232
- model_dst=model_dst
233
  )
234
  log.info(f"loading newly-created file: {model_dst}")
235
- model = transformers.AutoModelForCausalLM.from_pretrained(
236
- model_dst,
237
- **config
 
238
  )
239
  log.info(
240
  f"loaded new model file: {model_dst} "
@@ -246,7 +299,29 @@ def run():
246
  question=question,
247
  device=device,
248
  )
249
- log.info(f"done loading new model: {model} file: {model_dst}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
 
252
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
 
3
+ """
4
+ If you want to fine-tune, here's an example Unsloth fine tuning guide for:
5
+ Alpaca + TinyLlama + RoPE Scaling full example.ipynb
6
+ https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing
7
+
8
+ Code here was refactored from gist:
9
+ https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
10
+
11
+ Fine tuning example:
12
+ https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing
13
+
14
+ CodeLlama example:
15
+ https://huggingface.co/collections/mlabonne/codellama-6509bc68c2d4c8fc379ee87f
16
+ """
17
+
18
+ import os
19
  import transformers
20
  import torch
21
  import logging
 
23
  from ddare.tensor import (
24
  dare_ties_sparsification,
25
  relative_norm,
26
+ divide_tensor_into_sets,
27
  )
28
  from ddare.util import get_device
29
  import re
30
  from typing import Dict, Tuple, List
31
 
32
 
33
  logging.basicConfig(level=logging.INFO)
34
  log = logging.getLogger(__name__)
 
38
  models: List[str],
39
  trust_remote_code: bool,
40
  ):
41
+ """
42
+ get the models
43
+
44
+ :param models: model names to download
45
+ :param trust_remote_code: set True only if you trust these model repos enough to run their remote code
46
+ """
47
  config = {
48
+ "torch_dtype": torch.float16,
49
+ "low_cpu_mem_usage": False,
50
+ "trust_remote_code": trust_remote_code,
51
  }
52
  loaded_models = []
53
  num_models = len(models)
 
58
  )
59
  loaded_models.append(
60
  transformers.AutoModelForCausalLM.from_pretrained(
61
+ model_path, **config
 
62
  )
63
  )
64
  return loaded_models
 
67
  def pm(
68
  model,
69
  ):
70
+ """
71
+ pretty print the model's state dict
72
+
73
+ :param model: the model to inspect
74
+ """
75
  keys = model.state_dict().keys()
76
  log.info(f"model keys={len(keys)}")
77
  for i, k in enumerate(keys):
 
79
  log.info(
80
  f"{i:3d} {k} shape={tensor.shape} "
81
  f"type={tensor.dtype} dev={tensor.device} "
82
+ f"contig={tensor.is_contiguous()}"
83
+ )
84
 
85
 
86
  def run_text_test(
87
  model,
88
+ tokenizer_path: str,
89
  question: str,
90
  device: str = "cuda",
91
  ):
92
+ """
93
+ run a question on the model, log the answer, and return the tokenizer
94
+
95
+ :param model: initialized model
96
+ :param tokenizer_path: tokenizer path/name
97
+ :param question: the prompt to send to the model
98
+ :param device: device to run on, e.g. "cpu" or "cuda"
99
+ """
100
  base_model = model.to(device)
101
+ log.info(f"loading tokenizer={tokenizer_path}")
 
 
102
  tokenizer = transformers.AutoTokenizer.from_pretrained(
103
  tokenizer_path,
104
  torch_dtype=torch.float16,
105
  )
106
 
107
+ inputs = tokenizer(question, return_tensors="pt").to(
108
+ device
109
+ )
 
110
  with torch.backends.cuda.sdp_kernel(
111
  enable_flash=True,
112
  enable_math=False,
113
+ enable_mem_efficient=True,
114
  ):
115
  outputs = base_model.generate(
116
  **inputs,
117
+ max_new_tokens=256,
118
  )
119
+ answer = tokenizer.decode(
120
+ outputs[0], skip_special_tokens=True
121
+ )
122
  log.info(
123
  "\n"
124
  "----------"
 
129
  "----------"
130
  )
131
  base_model = base_model.to(device)
132
+ return tokenizer
133
 
134
 
135
+ def get_layer_type(key: str) -> Tuple[int, str]:
136
+ """
137
+ get the layer type
138
+
139
+ :param key: name of the layer
140
+ :return: layer id and name
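+ (e.g. "model.layers.3.self_attn.q_proj.weight" -> (3, "self_attn.q_proj.weight"))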
141
+ """
142
  matcher = re.compile(r"model.layers.(\d+).(.+)")
143
  m = matcher.match(key)
144
  if m is None:
 
156
  def merge_model_with_ties(
157
  models: List[str],
158
  model_dst: str,
159
+ trust_remote_code: bool = True,
160
  ):
161
+ """
162
+ merge the list of models into one model
163
+ called model_dst
164
+
165
+ :param models: list of models to merge
166
+ :param model_dst: name of the new model
167
+ :param trust_remote_code: set True only if you trust these model repos enough to run their remote code
168
+ """
169
  models = get_models(
170
  models=models,
171
  trust_remote_code=trust_remote_code,
 
191
 
192
  # build a ratio
193
  ratio = {
194
+ "to_q": 0.0,
195
+ "to_k": 0.0,
196
+ "to_v": 0.0,
197
+ }.get(layer_type, 0.5)
198
 
199
  norm_ratio = 0.68
200
  log.info(
201
  f"model={k} {num_keys} shape={m0.shape} "
202
  f"dtype={m0.dtype} {m0.device} "
203
+ f"ratio={ratio} "
204
  f"contig={m0.is_contiguous()} "
205
+ f"norm={norm_ratio}"
206
+ )
207
 
208
  # for all tensors
209
  for i, tensor in enumerate(m):
210
  if layer_type == "to_k":
211
  # Get to_q key
212
+ q_base = models[0].state_dict()[
213
+ k.replace("to_k", "to_q")
214
+ ]
215
+ q_merge = models[i].state_dict()[
216
+ k.replace("to_k", "to_q")
217
+ ]
218
  scale = relative_norm(q_merge, q_base)
219
  tensor = tensor.to(device) / scale
220
  del scale
 
222
  scale = relative_norm(tensor, m0)
223
  tensor = tensor.to(device) * scale
224
  del scale
225
+ slice_mask = (sets == i).bool()
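+ # `sets` assigns each tensor element to one source model (assumption:
+ # built earlier with divide_tensor_into_sets); this mask picks model i's share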
 
 
226
  new_tensor = dare_ties_sparsification(
227
  model_a_param=m0,
228
  model_b_param=tensor,
 
230
  ties="sum",
231
  rescale="off",
232
  device=device,
233
+ **config,
234
+ )
235
+ new_tensor = merge_tensors(
236
+ "slerp", m0, tensor, ratio
237
+ )
238
+ result = torch.where(
239
+ slice_mask, new_tensor, result
240
+ )
241
  del new_tensor, slice_mask
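  # note: the merge_tensors("slerp", ...) call above reassigns new_tensor,
  # so the dare_ties_sparsification output is discarded and only the slerp
  # result lands in the slots selected by slice_mask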
242
 
243
  result_dict[k] = result
244
  # end of merge
245
 
246
+ log.info(f"done merge saving to file: {model_dst}")
 
 
247
  out_model = (
248
  transformers.AutoModelForCausalLM.from_pretrained(
249
+ model_dst, **config
 
250
  )
251
  )
252
  out_model.state_dict = lambda: result_dict
 
254
 
255
 
256
  def run():
257
+ """
258
+ run the merge and upload the model and tokenizer
259
+
260
+ This requires having the Hugging Face token
261
+ set before it will work:
262
+ ```huggingface-cli login```
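+ (or, assuming the standard huggingface_hub API, programmatically:
+ `from huggingface_hub import login; login()`)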
263
+ """
264
+ question = "why is the sky blue?"
265
+ log.info(
266
+ f"merging models and asking the question: {question}"
267
  )
 
268
  model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
269
  model_dst = "matlok/tinyllama-cinder-openhermes-32k"
270
  device = "cuda"
271
  config = {
272
+ "torch_dtype": torch.float16,
273
+ "low_cpu_mem_usage": False,
274
+ "trust_remote_code": True,
275
  }
276
  models = [
277
  model_src,
 
281
  "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
282
  ]
283
  merge_model_with_ties(
284
+ models=models, model_dst=model_dst
 
285
  )
286
  log.info(f"loading newly-created file: {model_dst}")
287
+ model = (
288
+ transformers.AutoModelForCausalLM.from_pretrained(
289
+ model_dst, **config
290
+ )
291
  )
292
  log.info(
293
  f"loaded new model file: {model_dst} "
 
299
  question=question,
300
  device=device,
301
  )
302
+
303
+ # clean the temp merge dir
304
+ # remove model dir to prevent issues with the tokenizer upload
305
+ model_org = model_dst.split("/")[0]
306
+ if os.path.exists(model_org):
307
+ os.system(f"rm -rf ./{model_org}")
308
+
309
+ log.info(f"uploading model: {model_dst}")
310
+ model.push_to_hub(model_dst)
311
+
312
+ log.info(f"uploading src tokenizer: {model_src}")
313
+ # reload the tokenizer so it can be saved too; approach found at:
314
+ # https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing#scrollTo=QQn30cRtAZ-P
315
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
316
+ model_src, trust_remote_code=True
317
+ )
318
+ # https://huggingface.co/docs/transformers/model_sharing#use-the-pushtohub-function
319
+ # tokenizer.push_to_hub("my-awesome-model")
320
+ tokenizer.push_to_hub(model_dst)
321
+ log.info(
322
+ f"done loading new model: {model} "
323
+ f"file: {model_dst}"
324
+ )
325
 
326
 
327
  if __name__ == "__main__":