---
license: unknown
---

## Merging models like Lego blocks using ddare and TIES

If you want to fine-tune, here's an example Unsloth fine-tuning guide:

[Alpaca + TinyLlama + RoPE Scaling full example.ipynb](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing#scrollTo=LjY75GoYUCB8)

## How do I generate my own model merges?

The code below merges the following HuggingFace TinyLlama models:

- TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
- Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct
- Doctor-Shotgun/TinyLlama-1.1B-32k
- Tensoic/TinyLlama-1.1B-3T-openhermes
- Josephgflowers/TinyLlama-3T-Cinder-v1.3

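Before running the full merge script below, it can help to see the parameter names the merge loop iterates over. This short sketch (plain `transformers` only, using the base checkpoint from the list above) prints the first few `state_dict` keys so the `model.layers.<n>.<suffix>` naming that the merge code parses is visible:

```python
import transformers

# Load the base model and print a handful of its state_dict keys to show the
# "model.layers.<n>.<suffix>" naming that the merge script parses.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
)
for i, key in enumerate(model.state_dict().keys()):
    print(i, key)
    if i >= 12:
        break
```
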
```python
import logging
import re
from typing import Dict, List, Tuple

import torch
import transformers
from ddare.merge import merge_tensors
from ddare.tensor import (
    dare_ties_sparsification,
    divide_tensor_into_sets,
    relative_norm,
)
from ddare.util import get_device

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def get_models(
    models: List[str],
    trust_remote_code: bool,
):
    """Load each HuggingFace model in fp16 and return them in order."""
    config = {
        'torch_dtype': torch.float16,
        'low_cpu_mem_usage': False,
        'trust_remote_code': trust_remote_code,
    }
    loaded_models = []
    num_models = len(models)
    for midx, model_path in enumerate(models):
        log.info(
            f"loading model={midx}/{num_models} "
            f"model={model_path} "
        )
        loaded_models.append(
            transformers.AutoModelForCausalLM.from_pretrained(
                model_path,
                **config
            )
        )
    return loaded_models


def pm(
    model,
):
    """Print every tensor in the model's state_dict for a quick sanity check."""
    keys = model.state_dict().keys()
    log.info(f"model keys={len(keys)}")
    for i, k in enumerate(keys):
        tensor = model.state_dict()[k]
        log.info(
            f"{i:3d} {k} shape={tensor.shape} "
            f"type={tensor.dtype} dev={tensor.device} "
            f"contig={tensor.is_contiguous()}")


def run_text_test(
    model,
    model_path,
    device: str,
    question: str,
):
    """Generate a short answer to confirm the model still produces text."""
    base_model = model.to(device)
    log.info(
        f"loading model={model_path}"
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_path,
        torch_dtype=torch.float16)

    inputs = tokenizer(
        question,
        return_tensors="pt"
    ).to(device)
    with torch.backends.cuda.sdp_kernel(
        enable_flash=True,
        enable_math=False,
        enable_mem_efficient=False
    ):
        outputs = base_model.generate(**inputs)
    log.info(tokenizer.decode(outputs[0], skip_special_tokens=True))
    base_model = base_model.to("cpu")


def get_layer_type(
    key: str
) -> Tuple[int, str]:
    """Split a state_dict key into (layer index, layer suffix)."""
    matcher = re.compile(r"model\.layers\.(\d+)\.(.+)")
    m = matcher.match(key)
    if m is None:
        if "model.norm.weight" == key:
            return -1, "norm"
        if "model.embed_tokens.weight" == key:
            return -1, "embed"
        if "lm_head.weight" == key:
            return -1, "head"
        log.info(f"Unknown key {key}")
        return -1, "unknown"
    return int(m.group(1)), m.group(2)


def merge_model_with_ties(
    models: List[str],
    model_dst: str,
    trust_remote_code: bool = True
):
    """Merge the source models into the first model using DARE-TIES + SLERP."""
    models = get_models(
        models=models,
        trust_remote_code=trust_remote_code,
    )
    config = {}
    result_dict: Dict[str, torch.Tensor] = {}
    device = get_device()
    keys = models[0].state_dict().keys()
    num_keys = len(keys)
    for k in keys:
        block, layer_type = get_layer_type(k)
        m0: torch.Tensor = models[0].state_dict()[k]
        result = m0.clone()
        # split the tensor into 4 sets; each source model overwrites one set
        # and the remaining set keeps the base model's values
        sets = divide_tensor_into_sets(tensor=m0, n_sets=4)

        # get the src layers to merge (models[1:4]; the last loaded model
        # is not blended by this loop)
        m = [
            models[1].state_dict()[k],
            models[2].state_dict()[k],
            models[3].state_dict()[k],
        ]

        # build a per-layer-type SLERP ratio; the to_q/to_k/to_v names are
        # diffusers-style keys and never match TinyLlama's
        # self_attn.{q,k,v}_proj.weight keys, so every tensor here falls back
        # to the default 0.5
        ratio = {
            'to_q': 0.0,
            'to_k': 0.0,
            'to_v': 0.0,
        }.get(layer_type, .5)

        norm_ratio = 0.68
        log.info(
            f"model={k} {num_keys} shape={m0.shape} "
            f"dtype={m0.dtype} {m0.device} "
            f"raio={ratio} "
            f"contig={m0.is_contiguous()} "
            f"norm={norm_ratio}")

        # for all tensors
        for i, tensor in enumerate(m):
            if layer_type == "to_k":
                # rescale k by the norm of the matching q projection
                q_base = models[0].state_dict()[k.replace("to_k", "to_q")]
                q_merge = models[i + 1].state_dict()[k.replace("to_k", "to_q")]
                scale = relative_norm(q_merge, q_base)
                tensor = tensor.to(device) / scale
                del scale
            elif layer_type == "to_q":
                scale = relative_norm(tensor, m0)
                tensor = tensor.to(device) * scale
                del scale
            slice_mask = (
                sets == i
            ).bool()
            # drop most of the delta with DARE-TIES, then SLERP the
            # sparsified tensor back toward the base model
            new_tensor = dare_ties_sparsification(
                model_a_param=m0,
                model_b_param=tensor,
                drop_rate=norm_ratio,
                ties="sum",
                rescale="off",
                device=device,
                **config)
            new_tensor = merge_tensors("slerp", m0, new_tensor, ratio)
            result = torch.where(slice_mask, new_tensor, result)
            del new_tensor, slice_mask

        result_dict[k] = result
    # end of merge

    log.info(
        f"{config} - done merge saving to file: {model_dst}"
    )
    # load a model with the same architecture as a save skeleton (model_dst
    # must already exist locally or on the hub; otherwise load models[0]'s
    # path instead), then override its state_dict so save_pretrained writes
    # the merged weights
    out_model = (
        transformers.AutoModelForCausalLM.from_pretrained(
            model_dst,
            **config
        )
    )
    out_model.state_dict = lambda: result_dict
    out_model.save_pretrained(model_dst)


def run():
    log.info("start")
    model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    model_dst = "matlok/tinyllama-cinder-openhermes-32k"
    config = {
        'torch_dtype': torch.float16,
        'low_cpu_mem_usage': False,
        'trust_remote_code': True,
    }
    models = [
        model_src,
        "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
        "Doctor-Shotgun/TinyLlama-1.1B-32k",
        "Tensoic/TinyLlama-1.1B-3T-openhermes",
        "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
    ]
    merge_model_with_ties(
        models=models,
        model_dst=model_dst
    )
    log.info(f"loading newly-created file: {model_dst}")
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_dst,
        **config
    )
    pm(model=model)
    log.info(f"done loading new model: {model} file: {model_dst}")


if __name__ == "__main__":
    run()
```

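Once the merge has been saved, a quick way to sanity-check the new checkpoint is to load it back with plain `transformers` and generate a few tokens. This is a minimal sketch, assuming the merged weights were saved under the `matlok/tinyllama-cinder-openhermes-32k` path used above (the script only saves model weights, so the tokenizer is taken from the base TinyLlama repo):

```python
import torch
import transformers

model_path = "matlok/tinyllama-cinder-openhermes-32k"

# The merge script above only saves model weights, so load the tokenizer
# from the base TinyLlama repo.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
)

# Load the merged checkpoint in fp16 and move it to the GPU if one is available.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
).to("cuda" if torch.cuda.is_available() else "cpu")

# Generate a short completion to confirm the merged weights still produce text.
inputs = tokenizer("Why is the sky blue?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
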
### Logs

Here are the logs from the merge run:

```
|
Total VRAM 12282 MB, total RAM 85434 MB |
|
Set vram state to: NORMAL_VRAM |
|
Device: cuda:0 NVIDIA GeForce RTX 4070 Ti : native |
|
VAE dtype: torch.bfloat16 |
|
INFO:__main__:start |
|
INFO:__main__:loading model=0/5 model=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T |
|
INFO:__main__:loading model=1/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct |
|
INFO:__main__:loading model=2/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k |
|
INFO:__main__:loading model=3/5 model=Tensoic/TinyLlama-1.1B-3T-openhermes |
|
INFO:__main__:loading model=4/5 model=Josephgflowers/TinyLlama-3T-Cinder-v1.3 |
|
INFO:__main__:model=model.embed_tokens.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.norm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=lm_head.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68 |
|
INFO:__main__:{} - done merge saving to file: matlok/tinyllama-cinder-openhermes-32k |
|
config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 724/724 [00:00<00:00, 6.15MB/s] |
|
model.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.20G/2.20G [00:57<00:00, 38.0MB/s] |
|
generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 133/133 [00:00<00:00, 1.82MB/s] |
|
INFO:__main__:loading newly-created file: matlok/tinyllama-cinder-openhermes-32k |
|
INFO:__main__:model keys=201 |
|
INFO:__main__: 0 model.embed_tokens.weight shape=torch.Size([32000, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 1 model.layers.0.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 2 model.layers.0.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 3 model.layers.0.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 4 model.layers.0.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 5 model.layers.0.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 6 model.layers.0.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 7 model.layers.0.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 8 model.layers.0.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 9 model.layers.0.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 10 model.layers.1.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 11 model.layers.1.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 12 model.layers.1.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 13 model.layers.1.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 14 model.layers.1.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 15 model.layers.1.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 16 model.layers.1.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 17 model.layers.1.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 18 model.layers.1.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 19 model.layers.2.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 20 model.layers.2.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 21 model.layers.2.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 22 model.layers.2.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 23 model.layers.2.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 24 model.layers.2.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 25 model.layers.2.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 26 model.layers.2.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 27 model.layers.2.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 28 model.layers.3.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 29 model.layers.3.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 30 model.layers.3.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 31 model.layers.3.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 32 model.layers.3.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 33 model.layers.3.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 34 model.layers.3.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 35 model.layers.3.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 36 model.layers.3.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 37 model.layers.4.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 38 model.layers.4.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 39 model.layers.4.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 40 model.layers.4.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 41 model.layers.4.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 42 model.layers.4.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 43 model.layers.4.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 44 model.layers.4.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 45 model.layers.4.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 46 model.layers.5.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 47 model.layers.5.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 48 model.layers.5.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 49 model.layers.5.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 50 model.layers.5.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 51 model.layers.5.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 52 model.layers.5.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 53 model.layers.5.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 54 model.layers.5.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 55 model.layers.6.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 56 model.layers.6.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 57 model.layers.6.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 58 model.layers.6.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 59 model.layers.6.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 60 model.layers.6.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 61 model.layers.6.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 62 model.layers.6.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 63 model.layers.6.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 64 model.layers.7.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 65 model.layers.7.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 66 model.layers.7.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 67 model.layers.7.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 68 model.layers.7.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 69 model.layers.7.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 70 model.layers.7.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 71 model.layers.7.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 72 model.layers.7.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 73 model.layers.8.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 74 model.layers.8.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 75 model.layers.8.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 76 model.layers.8.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 77 model.layers.8.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 78 model.layers.8.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 79 model.layers.8.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 80 model.layers.8.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 81 model.layers.8.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 82 model.layers.9.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 83 model.layers.9.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 84 model.layers.9.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 85 model.layers.9.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 86 model.layers.9.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 87 model.layers.9.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 88 model.layers.9.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 89 model.layers.9.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 90 model.layers.9.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 91 model.layers.10.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 92 model.layers.10.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 93 model.layers.10.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 94 model.layers.10.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 95 model.layers.10.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 96 model.layers.10.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 97 model.layers.10.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 98 model.layers.10.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__: 99 model.layers.10.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:100 model.layers.11.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:101 model.layers.11.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:102 model.layers.11.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:103 model.layers.11.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:104 model.layers.11.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:105 model.layers.11.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:106 model.layers.11.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:107 model.layers.11.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:108 model.layers.11.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:109 model.layers.12.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:110 model.layers.12.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:111 model.layers.12.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:112 model.layers.12.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:113 model.layers.12.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:114 model.layers.12.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:115 model.layers.12.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:116 model.layers.12.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:117 model.layers.12.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True |
|
INFO:__main__:118 model.layers.13.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:119 model.layers.13.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:120 model.layers.13.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:121 model.layers.13.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:122 model.layers.13.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:123 model.layers.13.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:124 model.layers.13.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
INFO:__main__:125 model.layers.13.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:126 model.layers.13.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:127 model.layers.14.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:128 model.layers.14.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:129 model.layers.14.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:130 model.layers.14.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:131 model.layers.14.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:132 model.layers.14.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:133 model.layers.14.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
INFO:__main__:134 model.layers.14.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:135 model.layers.14.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:136 model.layers.15.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:137 model.layers.15.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:138 model.layers.15.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:139 model.layers.15.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:140 model.layers.15.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:141 model.layers.15.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:142 model.layers.15.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
INFO:__main__:143 model.layers.15.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:144 model.layers.15.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:145 model.layers.16.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:146 model.layers.16.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:147 model.layers.16.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:148 model.layers.16.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:149 model.layers.16.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:150 model.layers.16.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:151 model.layers.16.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
INFO:__main__:152 model.layers.16.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:153 model.layers.16.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:154 model.layers.17.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:155 model.layers.17.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:156 model.layers.17.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:157 model.layers.17.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:158 model.layers.17.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:159 model.layers.17.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:160 model.layers.17.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
INFO:__main__:161 model.layers.17.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:162 model.layers.17.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:163 model.layers.18.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:164 model.layers.18.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:165 model.layers.18.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:166 model.layers.18.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:167 model.layers.18.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:168 model.layers.18.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:169 model.layers.18.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
INFO:__main__:170 model.layers.18.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:171 model.layers.18.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:172 model.layers.19.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:173 model.layers.19.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:174 model.layers.19.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:175 model.layers.19.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:176 model.layers.19.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:177 model.layers.19.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:178 model.layers.19.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
INFO:__main__:179 model.layers.19.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:180 model.layers.19.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:181 model.layers.20.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:182 model.layers.20.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:183 model.layers.20.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:184 model.layers.20.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:185 model.layers.20.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:186 model.layers.20.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:187 model.layers.20.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
INFO:__main__:188 model.layers.20.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:189 model.layers.20.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:190 model.layers.21.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:191 model.layers.21.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:192 model.layers.21.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:193 model.layers.21.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:194 model.layers.21.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:195 model.layers.21.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:196 model.layers.21.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
INFO:__main__:197 model.layers.21.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:198 model.layers.21.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:199 model.norm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:200 lm_head.weight shape=torch.Size([32000, 2048]) type=torch.float16 dev=cpu contig=True
INFO:__main__:done loading new model: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
) file: matlok/tinyllama-cinder-openhermes-32k

real 1m18.070s
user 2m10.228s
sys 0m14.040s
```
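The attention projection shapes in the dump above follow from TinyLlama's grouped-query attention: with a hidden size of 2048, 32 query heads, and 4 key/value heads of dimension 64, `q_proj` maps 2048 -> 2048 while `k_proj` and `v_proj` map 2048 -> 4 x 64 = 256. Here is a minimal sketch that derives those shapes from the Hugging Face config; it assumes the standard `LlamaConfig` field names used by the TinyLlama base model.

```python3
# Sketch: derive the expected attention projection shapes from the config.
# Assumes the standard LlamaConfig fields (hidden_size, num_attention_heads,
# num_key_value_heads) used by TinyLlama / Llama-style models.
import transformers

cfg = transformers.AutoConfig.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
)
head_dim = cfg.hidden_size // cfg.num_attention_heads  # 2048 // 32 = 64
q_out = cfg.num_attention_heads * head_dim             # 32 * 64 = 2048
kv_out = cfg.num_key_value_heads * head_dim            # 4 * 64 = 256
print(f"q_proj:   [{q_out}, {cfg.hidden_size}]")
print(f"k/v_proj: [{kv_out}, {cfg.hidden_size}]")
```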
|
|
|
Note: the merge code sample above was adapted from [this very helpful GitHub gist](https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b).
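
To sanity-check the merge, you can load the output path shown in the log (`matlok/tinyllama-cinder-openhermes-32k`) like any other causal LM checkpoint. A minimal sketch, assuming the merged weights were saved locally or pushed to the Hub under that path:

```python3
# Minimal sketch: load the merged checkpoint and generate a short completion.
# The model path comes from the log output above; change it if you saved the
# merge somewhere else.
import torch
import transformers

model_path = "matlok/tinyllama-cinder-openhermes-32k"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)

inputs = tokenizer("Why is the sky blue?", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```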
|
|