mrfakename committed
Commit 129f554
0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1,43 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ models/ggml-vocab-aquila.gguf filter=lfs diff=lfs merge=lfs -text
37
+ models/ggml-vocab-baichuan.gguf filter=lfs diff=lfs merge=lfs -text
38
+ models/ggml-vocab-falcon.gguf filter=lfs diff=lfs merge=lfs -text
39
+ models/ggml-vocab-gpt-neox.gguf filter=lfs diff=lfs merge=lfs -text
40
+ models/ggml-vocab-mpt.gguf filter=lfs diff=lfs merge=lfs -text
41
+ models/ggml-vocab-refact.gguf filter=lfs diff=lfs merge=lfs -text
42
+ models/ggml-vocab-stablelm-3b-4e1t.gguf filter=lfs diff=lfs merge=lfs -text
43
+ models/ggml-vocab-starcoder.gguf filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,3 @@
1
+ THIS SOFTWARE IS NOT OPEN SOURCED!!! REDISTRIBUTION PROHIBITED!
2
+
3
+ Copyright 2023 mrfakename. All rights reserved. REDISTRIBUTION PROHIBITED!!! PLEASE don't republish this, put it on GitHub, share Colabs of it, etc.! Please DO NOT do this! Please ask FIRST instead! Please don't copy-paste this code into your project!
README.md ADDED
@@ -0,0 +1,16 @@
1
+ ---
2
+ title: Convert to GGUF
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.8.0
8
+ app_file: app.py
9
+ pinned: false
10
+ tags:
11
+ - gguf
12
+ ---
13
+
14
+ THIS SOFTWARE IS NOT OPEN SOURCED!!! REDISTRIBUTION PROHIBITED!
15
+
16
+ Copyright 2023 mrfakename. All rights reserved. REDISTRIBUTION PROHIBITED!!! PLEASE don't republish this, put it on GitHub, share Colabs of it, etc.! Please DO NOT do this! Please ask FIRST instead! Please don't copy-paste this code into your project!
README_TEMPLATE.md ADDED
@@ -0,0 +1,22 @@
1
+ ---
2
+ base_model: <<MODEL_ID>>
3
+ inference: false
4
+ pipeline_tag: text-generation
5
+ quantized_by: mfn
6
+ tags:
7
+ - gguf
8
+ ---
9
+
10
+ # GGUF
11
+
12
+ ## GGUF models for <<MODEL_ID>>
13
+
14
+ * Original model: [<<MODEL_ID>>](https://huggingface.co/<<MODEL_ID>>)
15
+
16
+ ## Description
17
+
18
+ This repository contains GGUF models for [<<MODEL_ID>>](https://huggingface.co/<<MODEL_ID>>).
19
+
20
+ ## License
21
+
22
+ The license of this model follows that of the original model.
app.py ADDED
@@ -0,0 +1,61 @@
1
+ # THIS SOFTWARE IS NOT OPEN SOURCED!!! REDISTRIBUTION PROHIBITED! SEE LICENSE FOR DETAILS.
2
+
3
+ ## TODO: Only allow 2 quantizations to run at once
4
+ from huggingface_hub import HfApi
5
+ import os
6
+ from hfconv import convert
7
+ from constants import *
8
+ import gradio as gr
9
+ import threading
10
+ from slugify import slugify
11
+
12
+ theme = gr.themes.Base(
13
+ font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
14
+ )
15
+ DESCRIPTION = """
16
+ Welcome to Convert to GGUF, a **free** tool that converts your models to GGUF and quantizes them.
17
+ """.strip()
18
+
19
+ # def run_real(model_id: str) -> str:
20
+ def run(model_id):
21
+ if model_id == "":
22
+ return """
23
+ ### Invalid input 🐞
24
+
25
+ Please input a model_id.
26
+ """
27
+ try:
28
+ api = HfApi(token=HF_TOKEN)
29
+ if not api.repo_exists(model_id):
30
+ raise gr.Error('Unable to locate repo')
31
+ # repo_id = convert(api=api, model_id=model_id)
32
+ background_thread = threading.Thread(target=convert, args=(api, model_id))
33
+ background_thread.start()
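+ # Build the expected destination repo name locally; this must mirror the naming
+ # scheme used by hfconv.convert() (username + "/" + slugified model_id + "-GGUF").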
34
+ repo_id = username + "/" + slugify(model_id.strip()) + "-GGUF"
35
+ string = f"""## Quantizing
36
+ We are quantizing the model now. If the conversion succeeds, the quantized model will be available [here](https://huggingface.co/{repo_id}). It may take up to several hours to complete. If it is not available after several hours, please try again; if repeated attempts fail, please contact us.""".strip()
37
+ # if errors:
38
+ # string += "\nErrors during conversion:\n"
39
+ # string += "\n".join(f"Error while converting {filename}: {e}, skipped conversion" for filename, e in errors)
40
+ return string
41
+ except Exception as e:
42
+ return f"""
43
+ ### Error
44
+
45
+ {e}
46
+ """
47
+ demo = gr.Interface(
48
+ title="Convert LLMs to GGUF & Quantize",
49
+ description=DESCRIPTION,
50
+ allow_flagging="never",
51
+ article="Created by [mrfakename](https://twitter.com/realmrfakename).",
52
+ inputs=[
53
+ gr.Text(max_lines=1, label="model_id"),
54
+ ],
55
+ outputs=[gr.Markdown(label="output")],
56
+ fn=run,
57
+ css="footer{display:none !important}",
58
+ theme=theme
59
+ )
60
+
61
+ demo.queue(api_open=False, max_size=15).launch(show_api=False)
constants.py ADDED
@@ -0,0 +1,4 @@
1
+ import os
2
+ username = 'gguf'
3
+ HF_TOKEN = os.environ.get("HF_TOKEN")
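+ # Quantization presets (llama.cpp k-quant types) produced for each model, ordered
+ # roughly from largest/highest quality (Q6_K) down to smallest (Q2_K).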
4
+ types_to_quantize = ['Q6_K', 'Q5_K_M', 'Q5_K_S', 'Q4_K_M', 'Q4_K_S', 'Q3_K_L', 'Q3_K_M', 'Q3_K_S', 'Q2_K']
cscript.py ADDED
@@ -0,0 +1,1168 @@
1
+ # THIS FILE IS MIT LICENSED BUT THE REST IS NOT! SEE THE LICENSE FOR DETAILS! ADAPTED FROM THE LLAMA.CPP PROJECT'S convert.py SCRIPT.
2
+ #!/usr/bin/env python3
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import concurrent.futures
7
+ import enum
8
+ import faulthandler
9
+ import functools
10
+ import itertools
11
+ import json
12
+ import math
13
+ import mmap
14
+ import pickle
15
+ import re
16
+ import signal
17
+ import struct
18
+ import sys
19
+ import time
20
+ import zipfile
21
+ from abc import ABCMeta, abstractmethod
22
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
23
+ from dataclasses import dataclass
24
+ from pathlib import Path
25
+ from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
26
+
27
+ import numpy as np
28
+ from sentencepiece import SentencePieceProcessor
29
+
30
+ import os
31
+ if 'NO_LOCAL_GGUF' not in os.environ:
32
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
33
+ import gguf
34
+
35
+ if TYPE_CHECKING:
36
+ from typing import TypeAlias
37
+
38
+ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
39
+ faulthandler.register(signal.SIGUSR1)
40
+
41
+ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
42
+
43
+ ARCH = gguf.MODEL_ARCH.LLAMA
44
+
45
+ DEFAULT_CONCURRENCY = 8
46
+ #
47
+ # data types
48
+ #
49
+
50
+
51
+ @dataclass(frozen=True)
52
+ class DataType:
53
+ name: str
54
+ dtype: np.dtype[Any]
55
+ valid_conversions: list[str]
56
+
57
+ def elements_to_bytes(self, n_elements: int) -> int:
58
+ return n_elements * self.dtype.itemsize
59
+
60
+
61
+ @dataclass(frozen=True)
62
+ class UnquantizedDataType(DataType):
63
+ pass
64
+
65
+
66
+ DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
67
+ DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
68
+ DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
69
+ DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
70
+
71
+
72
+ @dataclass(frozen=True)
73
+ class QuantizedDataType(DataType):
74
+ block_size: int
75
+ quantized_dtype: np.dtype[Any]
76
+ ggml_type: gguf.GGMLQuantizationType
77
+
78
+ def quantize(self, arr: NDArray) -> NDArray:
79
+ raise NotImplementedError(f'Quantization for {self.name} not implemented')
80
+
81
+ def elements_to_bytes(self, n_elements: int) -> int:
82
+ assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
83
+ return self.quantized_dtype.itemsize * (n_elements // self.block_size)
84
+
85
+
86
+ @dataclass(frozen=True)
87
+ class Q8_0QuantizedDataType(QuantizedDataType):
88
+ # Mini Q8_0 quantization in Python!
89
+ def quantize(self, arr: NDArray) -> NDArray:
90
+ assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
91
+ assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
92
+ n_blocks = arr.size // self.block_size
93
+ blocks = arr.reshape((n_blocks, self.block_size))
94
+ # Much faster implementation of block quantization contributed by @Cebtenzzre
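+ # Q8_0 layout: each block of 32 values is stored as one fp16 scale `d` plus 32 signed
+ # 8-bit quants `qs`; dequantization is q * d, so d is chosen as max(|x|) / 127.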
95
+
96
+ def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
97
+ d = abs(blocks).max(axis = 1) / np.float32(127)
98
+ with np.errstate(divide = 'ignore'):
99
+ qs = (blocks / d[:, None]).round()
100
+ qs[d == 0] = 0
101
+ yield from zip(d, qs)
102
+ return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
103
+
104
+
105
+ DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
106
+ dtype = np.dtype(np.float32), valid_conversions = [],
107
+ ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
108
+ quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
109
+
110
+ # Quantized types skipped here because they may also map to np.float32
111
+ NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
112
+ for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
113
+ if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
114
+ raise ValueError(f'Invalid duplicate data type {dt}')
115
+ NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
116
+
117
+ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
118
+ 'BF16': DT_BF16,
119
+ 'F16': DT_F16,
120
+ 'F32': DT_F32,
121
+ 'I32': DT_I32,
122
+ }
123
+
124
+ # TODO: match this with `llama_ftype`
125
+ # TODO: rename to LLAMAFileType
126
+ # TODO: move to `gguf.py`
127
+
128
+
129
+ class GGMLFileType(enum.IntEnum):
130
+ AllF32 = 0
131
+ MostlyF16 = 1 # except 1d tensors
132
+ MostlyQ8_0 = 7 # except 1d tensors
133
+
134
+ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
135
+ dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
136
+ if dt is None:
137
+ raise ValueError(self)
138
+ # 1D tensors are always F32.
139
+ return dt if len(tensor.shape) > 1 else DT_F32
140
+
141
+
142
+ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
143
+ GGMLFileType.AllF32 : DT_F32,
144
+ GGMLFileType.MostlyF16 : DT_F16,
145
+ GGMLFileType.MostlyQ8_0: DT_Q8_0,
146
+ }
147
+
148
+ #
149
+ # hparams loading
150
+ #
151
+
152
+
153
+ @dataclass
154
+ class Params:
155
+ n_vocab: int
156
+ n_embd: int
157
+ n_layer: int
158
+ n_ctx: int
159
+ n_ff: int
160
+ n_head: int
161
+ n_head_kv: int
162
+ f_norm_eps: float
163
+
164
+ rope_scaling_type: gguf.RopeScalingType | None = None
165
+ f_rope_freq_base: float | None = None
166
+ f_rope_scale: float | None = None
167
+ n_orig_ctx: int | None = None
168
+ rope_finetuned: bool | None = None
169
+
170
+ ftype: GGMLFileType | None = None
171
+
172
+ # path to the directory containing the model files
173
+ path_model: Path | None = None
174
+
175
+ @staticmethod
176
+ def guessed(model: LazyModel) -> Params:
177
+ # try transformer naming first
178
+ n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
179
+
180
+ # try transformer naming first
181
+ if "model.layers.0.self_attn.q_proj.weight" in model:
182
+ n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
183
+ elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
184
+ n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
185
+ else:
186
+ n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
187
+
188
+ if n_layer < 1:
189
+ raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
190
+ "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
191
+
192
+ n_head = n_embd // 128 # guessed
193
+ n_mult = 256 # guessed
194
+
195
+ # TODO: verify this
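+ # LLaMA sizes its SwiGLU feed-forward layer as 2/3 * (4 * n_embd), rounded up to a
+ # multiple of n_mult.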
196
+ n_ff = int(2 * (4 * n_embd) / 3)
197
+ n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
198
+
199
+ return Params(
200
+ n_vocab = n_vocab,
201
+ n_embd = n_embd,
202
+ n_layer = n_layer,
203
+ n_ctx = -1,
204
+ n_ff = n_ff,
205
+ n_head = n_head,
206
+ n_head_kv = n_head,
207
+ f_norm_eps = 1e-5,
208
+ )
209
+
210
+ @staticmethod
211
+ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
212
+ config = json.load(open(config_path))
213
+
214
+ rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
215
+ rope_scaling = config.get("rope_scaling")
216
+
217
+ if rope_scaling is not None and (typ := rope_scaling.get("type")):
218
+ rope_factor = rope_scaling.get("factor")
219
+ f_rope_scale = rope_factor
220
+ if typ == "linear":
221
+ rope_scaling_type = gguf.RopeScalingType.LINEAR
222
+ elif typ == "yarn":
223
+ rope_scaling_type = gguf.RopeScalingType.YARN
224
+ n_orig_ctx = rope_scaling['original_max_position_embeddings']
225
+ rope_finetuned = rope_scaling['finetuned']
226
+ else:
227
+ raise NotImplementedError(f'Unknown rope scaling type: {typ}')
228
+
229
+ if "max_sequence_length" in config:
230
+ n_ctx = config["max_sequence_length"]
231
+ elif "max_position_embeddings" in config:
232
+ n_ctx = config["max_position_embeddings"]
233
+ else:
234
+ raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
235
+ "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
236
+
237
+ return Params(
238
+ n_vocab = config["vocab_size"],
239
+ n_embd = config["hidden_size"],
240
+ n_layer = config["num_hidden_layers"],
241
+ n_ctx = n_ctx,
242
+ n_ff = config["intermediate_size"],
243
+ n_head = (n_head := config["num_attention_heads"]),
244
+ n_head_kv = config.get("num_key_value_heads", n_head),
245
+ f_norm_eps = config["rms_norm_eps"],
246
+ f_rope_freq_base = config.get("rope_theta"),
247
+ rope_scaling_type = rope_scaling_type,
248
+ f_rope_scale = f_rope_scale,
249
+ n_orig_ctx = n_orig_ctx,
250
+ rope_finetuned = rope_finetuned,
251
+ )
252
+
253
+ # LLaMA v2 70B params.json
254
+ # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
255
+ @staticmethod
256
+ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
257
+ config = json.load(open(config_path))
258
+
259
+ # hack to determine LLaMA v1 vs v2 vs CodeLlama
260
+ if config.get("rope_theta") == 1000000:
261
+ # CodeLlama
262
+ n_ctx = 16384
263
+ elif config["norm_eps"] == 1e-05:
264
+ # LLaMA v2
265
+ n_ctx = 4096
266
+ else:
267
+ # LLaMA v1
268
+ n_ctx = 2048
269
+
270
+ return Params(
271
+ n_vocab = model["tok_embeddings.weight"].shape[0],
272
+ n_embd = config["dim"],
273
+ n_layer = config["n_layers"],
274
+ n_ctx = n_ctx,
275
+ n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
276
+ n_head = (n_head := config["n_heads"]),
277
+ n_head_kv = config.get("n_kv_heads", n_head),
278
+ f_norm_eps = config["norm_eps"],
279
+ f_rope_freq_base = config.get("rope_theta"),
280
+ )
281
+
282
+ @staticmethod
283
+ def load(model_plus: ModelPlus) -> Params:
284
+ hf_config_path = model_plus.paths[0].parent / "config.json"
285
+ orig_config_path = model_plus.paths[0].parent / "params.json"
286
+
287
+ if hf_config_path.exists():
288
+ params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
289
+ elif orig_config_path.exists():
290
+ params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
291
+ elif model_plus.format != 'none':
292
+ params = Params.guessed(model_plus.model)
293
+ else:
294
+ raise ValueError('Cannot guess params when model format is none')
295
+
296
+ params.path_model = model_plus.paths[0].parent
297
+
298
+ return params
299
+
300
+
301
+ #
302
+ # vocab
303
+ #
304
+
305
+ class BpeVocab:
306
+ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
307
+ self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
308
+ added_tokens: dict[str, int]
309
+ if fname_added_tokens is not None:
310
+ # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
311
+ added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
312
+ else:
313
+ # Fall back to trying to find the added tokens in tokenizer.json
314
+ tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
315
+ if not tokenizer_json_file.is_file():
316
+ added_tokens = {}
317
+ else:
318
+ tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
319
+ added_tokens = dict(
320
+ (item['content'], item['id'])
321
+ for item in tokenizer_json.get('added_tokens', [])
322
+ # Added tokens here can be duplicates of the main vocabulary.
323
+ if item['content'] not in self.bpe_tokenizer)
324
+
325
+ vocab_size: int = len(self.bpe_tokenizer)
326
+ expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
327
+ actual_ids = sorted(added_tokens.values())
328
+ if expected_ids != actual_ids:
329
+ expected_end_id = vocab_size + len(actual_ids) - 1
330
+ raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
331
+
332
+ items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
333
+ self.added_tokens_list = [text for (text, idx) in items]
334
+ self.vocab_size_base: int = vocab_size
335
+ self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
336
+ self.fname_tokenizer = fname_tokenizer
337
+ self.fname_added_tokens = fname_added_tokens
338
+
339
+ def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
340
+ tokenizer = self.bpe_tokenizer
341
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
342
+
343
+ for i, _ in enumerate(tokenizer):
344
+ yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
345
+
346
+ def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
347
+ for text in self.added_tokens_list:
348
+ score = -1000.0
349
+ yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
350
+
351
+ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
352
+ yield from self.bpe_tokens()
353
+ yield from self.added_tokens()
354
+
355
+ def __repr__(self) -> str:
356
+ return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
357
+
358
+
359
+ class SentencePieceVocab:
360
+ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
361
+ self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
362
+ added_tokens: dict[str, int]
363
+ if fname_added_tokens is not None:
364
+ added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
365
+ else:
366
+ added_tokens = {}
367
+
368
+ vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
369
+
370
+ new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
371
+ expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
372
+ actual_new_ids = sorted(new_tokens.keys())
373
+
374
+ if expected_new_ids != actual_new_ids:
375
+ raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
376
+
377
+ # Token pieces that were added to the base vocabulary.
378
+ self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
379
+ self.vocab_size_base = vocab_size
380
+ self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
381
+ self.fname_tokenizer = fname_tokenizer
382
+ self.fname_added_tokens = fname_added_tokens
383
+
384
+ def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
385
+ tokenizer = self.sentencepiece_tokenizer
386
+ for i in range(tokenizer.vocab_size()):
387
+ piece = tokenizer.id_to_piece(i)
388
+ text: bytes = piece.encode("utf-8")
389
+ score: float = tokenizer.get_score(i)
390
+
391
+ toktype = gguf.TokenType.NORMAL
392
+ if tokenizer.is_unknown(i):
393
+ toktype = gguf.TokenType.UNKNOWN
394
+ if tokenizer.is_control(i):
395
+ toktype = gguf.TokenType.CONTROL
396
+
397
+ # NOTE: I think added_tokens are user defined.
398
+ # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
399
+ # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
400
+
401
+ if tokenizer.is_unused(i):
402
+ toktype = gguf.TokenType.UNUSED
403
+ if tokenizer.is_byte(i):
404
+ toktype = gguf.TokenType.BYTE
405
+
406
+ yield text, score, toktype
407
+
408
+ def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
409
+ for text in self.added_tokens_list:
410
+ score = -1000.0
411
+ yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
412
+
413
+ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
414
+ yield from self.sentencepiece_tokens()
415
+ yield from self.added_tokens()
416
+
417
+ def __repr__(self) -> str:
418
+ return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
419
+
420
+
421
+ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
422
+
423
+ #
424
+ # data loading
425
+ # TODO: reuse (probably move to gguf.py?)
426
+ #
427
+
428
+
429
+ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
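+ # Reorder the rows of HF-style Q/K projection weights into the interleaved layout
+ # that GGML's rotary-embedding (RoPE) code expects.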
430
+ # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
431
+ if n_head_kv is not None and n_head != n_head_kv:
432
+ n_head = n_head_kv
433
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
434
+ .swapaxes(1, 2)
435
+ .reshape(weights.shape))
436
+
437
+
438
+ class Tensor(metaclass=ABCMeta):
439
+ data_type: DataType
440
+
441
+ @abstractmethod
442
+ def astype(self, data_type: DataType) -> Tensor: ...
443
+ @abstractmethod
444
+ def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
445
+ @abstractmethod
446
+ def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
447
+ @abstractmethod
448
+ def part(self, n_part: int) -> UnquantizedTensor: ...
449
+ @abstractmethod
450
+ def to_ggml(self) -> GGMLCompatibleTensor: ...
451
+
452
+
453
+ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
454
+ assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
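+ # bfloat16 is the top 16 bits of an IEEE-754 float32, so widening to uint32 and
+ # shifting left by 16 reconstructs the exact float32 bit pattern.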
455
+ fp32_arr = bf16_arr.astype(np.uint32) << 16
456
+ return fp32_arr.view(np.float32)
457
+
458
+
459
+ class UnquantizedTensor(Tensor):
460
+ def __init__(self, ndarray: NDArray) -> None:
461
+ assert isinstance(ndarray, np.ndarray)
462
+ self.ndarray = ndarray
463
+ self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
464
+
465
+ def astype(self, data_type: DataType) -> Tensor:
466
+ dtype = data_type.dtype
467
+ if self.data_type == DT_BF16:
468
+ self.ndarray = bf16_to_fp32(self.ndarray)
469
+ return UnquantizedTensor(self.ndarray.astype(dtype))
470
+
471
+ def to_ggml(self) -> UnquantizedTensor:
472
+ return self
473
+
474
+ def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
475
+ r = self.ndarray.shape[0] // 3
476
+ return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
477
+
478
+ def part(self, n_part: int) -> UnquantizedTensor:
479
+ r = self.ndarray.shape[0] // 3
480
+ return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
481
+
482
+ def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
483
+ return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
484
+
485
+
486
+ def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
487
+ tensor = lazy_tensor.load()
488
+ assert isinstance(tensor, UnquantizedTensor)
489
+
490
+ # double-check:
491
+ actual_shape = list(tensor.ndarray.shape)
492
+ assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
493
+ if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
494
+ if convert:
495
+ tensor.ndarray = tensor.ndarray.astype(expected_dtype)
496
+ else:
497
+ raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
498
+
499
+ return tensor.ndarray
500
+
501
+
502
+ GGMLCompatibleTensor = UnquantizedTensor
503
+
504
+
505
+ @dataclass
506
+ class LazyTensor:
507
+ _load: Callable[[], Tensor]
508
+ shape: list[int]
509
+ data_type: DataType
510
+ description: str
511
+
512
+ def load(self) -> Tensor:
513
+ ret = self._load()
514
+ # Should be okay if it maps to the same numpy type?
515
+ assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
516
+ (self.data_type, ret.data_type, self.description)
517
+ return ret
518
+
519
+ def astype(self, data_type: DataType) -> LazyTensor:
520
+ self.validate_conversion_to(data_type)
521
+
522
+ def load() -> Tensor:
523
+ return self.load().astype(data_type)
524
+ return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
525
+
526
+ def validate_conversion_to(self, data_type: DataType) -> None:
527
+ if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
528
+ raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
529
+
530
+
531
+ LazyModel: TypeAlias = 'dict[str, LazyTensor]'
532
+
533
+
534
+ @dataclass
535
+ class ModelPlus:
536
+ model: LazyModel
537
+ paths: list[Path] # Where this was read from.
538
+ format: Literal['ggml', 'torch', 'safetensors', 'none']
539
+ vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
540
+
541
+
542
+ def merge_sharded(models: list[LazyModel]) -> LazyModel:
543
+ # Original LLaMA models have each file contain one part of each tensor.
544
+ # Use a dict instead of a set to preserve order.
545
+ names = {name: None for model in models for name in model}
546
+
547
+ def convert(name: str) -> LazyTensor:
548
+ lazy_tensors: list[LazyTensor] = [model[name] for model in models]
549
+ if len(lazy_tensors) == 1:
550
+ # only one file; don't go through this procedure since there might
551
+ # be quantized tensors
552
+ return lazy_tensors[0]
553
+ if len(lazy_tensors[0].shape) == 1:
554
+ # the tensor is just duplicated in every file
555
+ return lazy_tensors[0]
556
+ if name.startswith('tok_embeddings.') or \
557
+ name.endswith('.attention.wo.weight') or \
558
+ name.endswith('.feed_forward.w2.weight'):
559
+ # split by columns
560
+ axis = 1
561
+ else:
562
+ # split by rows
563
+ axis = 0
564
+ concatenated_shape = list(lazy_tensors[0].shape)
565
+ concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
566
+
567
+ def load() -> UnquantizedTensor:
568
+ ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
569
+ concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
570
+ return UnquantizedTensor(concatenated)
571
+ description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
572
+ return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
573
+ return {name: convert(name) for name in names}
574
+
575
+
576
+ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
577
+ formats = set(mp.format for mp in models_plus)
578
+ assert len(formats) == 1, "different formats?"
579
+ format = formats.pop()
580
+ paths = [path for mp in models_plus for path in mp.paths]
581
+ # Use the first non-None vocab, if any.
582
+ try:
583
+ vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
584
+ except StopIteration:
585
+ vocab = None
586
+
587
+ if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
588
+ # Transformers models put different tensors in different files, but
589
+ # don't split individual tensors between files.
590
+ model: LazyModel = {}
591
+ for mp in models_plus:
592
+ model.update(mp.model)
593
+ else:
594
+ model = merge_sharded([mp.model for mp in models_plus])
595
+
596
+ return ModelPlus(model, paths, format, vocab)
597
+
598
+
599
+ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
600
+ def load() -> Tensor:
601
+ return lazy_tensor.load().permute(n_head, n_head_kv)
602
+ return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
603
+
604
+
605
+ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
606
+ def load() -> Tensor:
607
+ return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
608
+ s = lazy_tensor.shape.copy()
609
+ s[0] = s[0] // 3
610
+ return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
611
+
612
+
613
+ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
614
+ def load() -> Tensor:
615
+ return lazy_tensor.load().part(n_part)
616
+ s = lazy_tensor.shape.copy()
617
+ s[0] = s[0] // 3
618
+ return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
619
+
620
+
621
+ # Functionality that simulates `torch.load` but where individual tensors are
622
+ # only loaded into memory on demand, not all at once.
623
+ # PyTorch can't do this natively as of time of writing:
624
+ # - https://github.com/pytorch/pytorch/issues/64327
625
+ # This allows us to de-shard without multiplying RAM usage, and also
626
+ # conveniently drops the PyTorch dependency (though we still need numpy).
627
+
628
+
629
+ @dataclass
630
+ class LazyStorageKind:
631
+ data_type: DataType
632
+
633
+
634
+ @dataclass
635
+ class LazyStorage:
636
+ load: Callable[[int, int], NDArray]
637
+ kind: LazyStorageKind
638
+ description: str
639
+
640
+
641
+ class LazyUnpickler(pickle.Unpickler):
642
+ def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
643
+ super().__init__(fp)
644
+ self.data_base_path = data_base_path
645
+ self.zip_file = zip_file
646
+
647
+ def persistent_load(self, pid: Any) -> Any:
648
+ assert pid[0] == 'storage'
649
+ assert isinstance(pid[1], LazyStorageKind)
650
+ data_type = pid[1].data_type
651
+ filename_stem = pid[2]
652
+ filename = f'{self.data_base_path}/{filename_stem}'
653
+ info = self.zip_file.getinfo(filename)
654
+
655
+ def load(offset: int, elm_count: int) -> NDArray:
656
+ dtype = data_type.dtype
657
+ fp = self.zip_file.open(info)
658
+ fp.seek(offset * dtype.itemsize)
659
+ size = elm_count * dtype.itemsize
660
+ data = fp.read(size)
661
+ assert len(data) == size
662
+ return np.frombuffer(data, dtype)
663
+ description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
664
+ return LazyStorage(load=load, kind=pid[1], description=description)
665
+
666
+ @staticmethod
667
+ def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
668
+ requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
669
+ assert isinstance(storage, LazyStorage)
670
+
671
+ def load() -> UnquantizedTensor:
672
+ elm_count = stride[0] * size[0]
673
+ return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
674
+ description = f'pickled storage_offset={storage_offset} in {storage.description}'
675
+ return LazyTensor(load, list(size), storage.kind.data_type, description)
676
+
677
+ @staticmethod
678
+ def rebuild_from_type_v2(func, new_type, args, state):
679
+ return func(*args)
680
+
681
+ CLASSES: dict[tuple[str, str], Any] = {
682
+ # getattr used here as a workaround for mypy not being smart enough to determine
683
+ # the staticmethods have a __func__ attribute.
684
+ ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
685
+ ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
686
+ ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
687
+ ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
688
+ ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
689
+ ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
690
+ ('torch', 'Tensor'): LazyTensor,
691
+ }
692
+
693
+ def find_class(self, module: str, name: str) -> Any:
694
+ if not module.startswith('torch'):
695
+ return super().find_class(module, name)
696
+ return self.CLASSES[(module, name)]
697
+
698
+
699
+ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
700
+ zf = zipfile.ZipFile(outer_fp)
701
+ pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
702
+ assert len(pickle_paths) == 1, pickle_paths
703
+ pickle_fp = zf.open(pickle_paths[0], 'r')
704
+ unpickler = LazyUnpickler(pickle_fp,
705
+ data_base_path=pickle_paths[0][:-4],
706
+ zip_file=zf)
707
+ model = unpickler.load()
708
+ if 'model' in model: model = model['model']
709
+ as_dict = dict(model.items())
710
+ return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
711
+
712
+
713
+ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
714
+ header_size, = struct.unpack('<Q', fp.read(8))
715
+ header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
716
+ # Use mmap for the actual data to avoid race conditions with the file offset.
717
+ mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
718
+ byte_buf = mapped[8 + header_size:]
719
+
720
+ def convert(info: dict[str, Any]) -> LazyTensor:
721
+ data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
722
+ numpy_dtype = data_type.dtype
723
+ shape: list[int] = info['shape']
724
+ begin, end = info['data_offsets']
725
+ assert 0 <= begin <= end <= len(byte_buf)
726
+ assert end - begin == math.prod(shape) * numpy_dtype.itemsize
727
+ buf = byte_buf[begin:end]
728
+
729
+ def load() -> UnquantizedTensor:
730
+ return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
731
+ description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
732
+ return LazyTensor(load, shape, data_type, description)
733
+ model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
734
+ return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
735
+
736
+
737
+ def must_read(fp: IO[bytes], length: int) -> bytes:
738
+ ret = fp.read(length)
739
+ if len(ret) < length:
740
+ raise Exception("unexpectedly reached end of file")
741
+ return ret
742
+
743
+
744
+ @functools.lru_cache(maxsize=None)
745
+ def lazy_load_file(path: Path) -> ModelPlus:
746
+ fp = open(path, 'rb')
747
+ first8 = fp.read(8)
748
+ fp.seek(0)
749
+ if first8[:2] == b'PK':
750
+ # A zip file, i.e. PyTorch format
751
+ return lazy_load_torch_file(fp, path)
752
+ elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
753
+ # Probably safetensors
754
+ return lazy_load_safetensors_file(fp, path)
755
+ else:
756
+ raise ValueError(f"unknown format: {path}")
757
+
758
+
759
+ In = TypeVar('In')
760
+ Out = TypeVar('Out')
761
+
762
+
763
+ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
764
+ '''Parallel map, but with backpressure. If the caller doesn't call `next`
765
+ fast enough, this will stop calling `func` at some point rather than
766
+ letting results pile up in memory. Specifically, there is a max of one
767
+ output value buffered per thread.'''
768
+ if concurrency < 2:
769
+ yield from map(func, iterable)
770
+ return  # stop here; without this, execution would fall through to the executor path below
771
+ iterable = iter(iterable)
772
+ executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
773
+ if use_processpool_executor:
774
+ executor_class = ProcessPoolExecutor
775
+ else:
776
+ executor_class = ThreadPoolExecutor
777
+ with executor_class(max_workers = max_workers) as executor:
778
+ futures: list[concurrent.futures.Future[Out]] = []
779
+ done = False
780
+ for _ in range(concurrency):
781
+ try:
782
+ futures.append(executor.submit(func, next(iterable)))
783
+ except StopIteration:
784
+ done = True
785
+ break
786
+
787
+ while futures:
788
+ result = futures.pop(0).result()
789
+ while not done and len(futures) < concurrency:
790
+ try:
791
+ futures.append(executor.submit(func, next(iterable)))
792
+ except StopIteration:
793
+ done = True
794
+ break
795
+ yield result
796
+
797
+
798
+ def check_vocab_size(params: Params, vocab: Vocab) -> None:
799
+ if params.n_vocab != vocab.vocab_size:
800
+ assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
801
+ if params.n_vocab == vocab.vocab_size_base:
802
+ print("Ignoring added_tokens.json since model matches vocab size without it.")
803
+ vocab.added_tokens_list = []
804
+ vocab.vocab_size = vocab.vocab_size_base
805
+ return
806
+ msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
807
+ if vocab.fname_added_tokens is not None:
808
+ msg += f" combined with {vocab.fname_added_tokens}"
809
+ msg += f" has {vocab.vocab_size})."
810
+ if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
811
+ msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
812
+ raise Exception(msg)
813
+
814
+
815
+ class OutputFile:
816
+ def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
817
+ self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
818
+
819
+ def add_meta_arch(self, params: Params) -> None:
820
+ name = "LLaMA"
821
+
822
+ # TODO: better logic to determine model name
823
+ if params.n_ctx == 4096:
824
+ name = "LLaMA v2"
825
+ elif params.path_model is not None:
826
+ name = str(params.path_model.parent).split('/')[-1]
827
+
828
+ self.gguf.add_name (name)
829
+ self.gguf.add_context_length (params.n_ctx)
830
+ self.gguf.add_embedding_length (params.n_embd)
831
+ self.gguf.add_block_count (params.n_layer)
832
+ self.gguf.add_feed_forward_length (params.n_ff)
833
+ self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
834
+ self.gguf.add_head_count (params.n_head)
835
+ self.gguf.add_head_count_kv (params.n_head_kv)
836
+ self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
837
+
838
+ if params.f_rope_freq_base is not None:
839
+ self.gguf.add_rope_freq_base(params.f_rope_freq_base)
840
+
841
+ if params.rope_scaling_type:
842
+ assert params.f_rope_scale is not None
843
+ self.gguf.add_rope_scaling_type(params.rope_scaling_type)
844
+ self.gguf.add_rope_scaling_factor(params.f_rope_scale)
845
+
846
+ if params.n_orig_ctx is not None:
847
+ self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
848
+
849
+ if params.rope_finetuned is not None:
850
+ self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
851
+
852
+ if params.ftype is not None:
853
+ self.gguf.add_file_type(params.ftype)
854
+
855
+ def add_meta_vocab(self, vocab: Vocab) -> None:
856
+ tokens = []
857
+ scores = []
858
+ toktypes = []
859
+ # NOTE: `all_tokens` returns the base vocabulary and added tokens
860
+ for text, score, toktype in vocab.all_tokens():
861
+ tokens.append(text)
862
+ scores.append(score)
863
+ toktypes.append(toktype)
864
+
865
+ if isinstance(vocab, SentencePieceVocab):
866
+ self.gguf.add_tokenizer_model("llama")
867
+ elif isinstance(vocab, BpeVocab):
868
+ self.gguf.add_tokenizer_model("gpt2")
869
+ else:
870
+ raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
871
+ self.gguf.add_token_list(tokens)
872
+ self.gguf.add_token_scores(scores)
873
+ self.gguf.add_token_types(toktypes)
874
+
875
+ def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
876
+ svocab.add_to_gguf(self.gguf)
877
+
878
+ def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
879
+ n_elements = int(np.prod(tensor.shape))
880
+ raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
881
+ data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
882
+ data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
883
+ self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)
884
+
885
+ def write_meta(self) -> None:
886
+ self.gguf.write_header_to_file()
887
+ self.gguf.write_kv_data_to_file()
888
+
889
+ def write_tensor_info(self) -> None:
890
+ self.gguf.write_ti_data_to_file()
891
+
892
+ def close(self) -> None:
893
+ self.gguf.close()
894
+
895
+ @staticmethod
896
+ def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
897
+ check_vocab_size(params, vocab)
898
+
899
+ of = OutputFile(fname_out, endianess=endianess)
900
+
901
+ # meta data
902
+ of.add_meta_arch(params)
903
+ of.add_meta_vocab(vocab)
904
+ of.add_meta_special_vocab(svocab)
905
+
906
+ of.write_meta()
907
+
908
+ of.close()
909
+
910
+ @staticmethod
911
+ def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
912
+ name, lazy_tensor = item
913
+ tensor = lazy_tensor.load().to_ggml()
914
+ return (lazy_tensor.data_type, tensor.ndarray)
915
+
916
+ @staticmethod
917
+ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
918
+ dt, arr = item
919
+ if not isinstance(dt, QuantizedDataType):
920
+ return arr
921
+ return dt.quantize(arr)
922
+
923
+ @staticmethod
924
+ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
925
+ check_vocab_size(params, vocab)
926
+
927
+ of = OutputFile(fname_out, endianess=endianess)
928
+
929
+ # meta data
930
+ of.add_meta_arch(params)
931
+ of.add_meta_vocab(vocab)
932
+ of.add_meta_special_vocab(svocab)
933
+
934
+ # tensor info
935
+ for name, lazy_tensor in model.items():
936
+ of.add_tensor_info(name, lazy_tensor)
937
+
938
+ of.write_meta()
939
+ of.write_tensor_info()
940
+
941
+ # tensor data
942
+ ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
943
+ if ftype == GGMLFileType.MostlyQ8_0:
944
+ ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
945
+ else:
946
+ ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
947
+
948
+ start = time.time()
949
+ for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
950
+ elapsed = time.time() - start
951
+ size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
952
+ padi = len(str(len(model)))
953
+ print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
954
+ of.gguf.write_tensor_data(ndarray)
955
+
956
+ of.close()
957
+
958
+
959
+ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
960
+ wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
961
+
962
+ if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
963
+ return GGMLFileType.AllF32
964
+ if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
965
+ return GGMLFileType.MostlyF16
966
+ if output_type_str == "q8_0":
967
+ return GGMLFileType.MostlyQ8_0
968
+
969
+ name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
970
+
971
+ raise Exception(f"Unexpected combination of types: {name_to_type}")
972
+
973
+
974
+ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
975
+ return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
976
+ for (name, tensor) in model.items()}
977
+
978
+
979
+ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
980
+ tmap = gguf.TensorNameMap(ARCH, params.n_layer)
981
+ should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
982
+
983
+ tmp = model
984
+
985
+ # HF models permute or pack some of the tensors, so we need to undo that
986
+ for i in itertools.count():
987
+ if f"model.layers.{i}.self_attn.q_proj.weight" in model:
988
+ print(f"Permuting layer {i}")
989
+ tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
990
+ tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
991
+ # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
992
+ elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
993
+ print(f"Unpacking and permuting layer {i}")
994
+ tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
995
+ tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
996
+ tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
997
+ del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
998
+ else:
999
+ break
1000
+
1001
+ out: LazyModel = {}
1002
+ for name, lazy_tensor in model.items():
1003
+ tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
1004
+ if name_new is None:
1005
+ raise Exception(f"Unexpected tensor name: {name}")
1006
+
1007
+ if tensor_type in should_skip:
1008
+ print(f"skipping tensor {name_new}")
1009
+ continue
1010
+
1011
+ print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
1012
+ out[name_new] = lazy_tensor
1013
+
1014
+ return out
1015
+
1016
+
1017
+ def nth_multifile_path(path: Path, n: int) -> Path | None:
1018
+ '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1019
+ the nth path in the model.
1020
+ '''
1021
+ # Support the following patterns:
1022
+ patterns: list[tuple[str, str]] = [
1023
+ # - x.00.pth, x.01.pth, etc.
1024
+ (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
1025
+ # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
1026
+ (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
1027
+ # x.bin, x.bin.1, etc.
1028
+ (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
1029
+ ]
1030
+ for regex, replacement in patterns:
1031
+ if re.search(regex, path.name):
1032
+ new_path = path.with_name(re.sub(regex, replacement, path.name))
1033
+ if new_path.exists():
1034
+ return new_path
1035
+ return None
1036
+
1037
+
1038
+ def find_multifile_paths(path: Path) -> list[Path]:
1039
+ '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1040
+ the whole list of paths in the model.
1041
+ '''
1042
+ ret: list[Path] = []
1043
+ for i in itertools.count():
1044
+ nth_path = nth_multifile_path(path, i)
1045
+ if nth_path is None:
1046
+ break
1047
+ ret.append(nth_path)
1048
+ if not ret:
1049
+ # No matches. This should only happen if the file was named, e.g.,
1050
+ # foo.0, and there was no file named foo. Oh well, try to process it
1051
+ # as a single file.
1052
+ return [path]
1053
+ return ret
1054
+
1055
+
1056
+ def load_some_model(path: Path) -> ModelPlus:
1057
+ '''Load a model of any supported format.'''
1058
+ # Be extra-friendly and accept either a file or a directory:
1059
+ if path.is_dir():
1060
+ # Check if it's a set of safetensors files first
1061
+ globs = ["model-00001-of-*.safetensors", "model.safetensors"]
1062
+ files = [file for glob in globs for file in path.glob(glob)]
1063
+ if not files:
1064
+ # Try the PyTorch patterns too, with lower priority
1065
+ globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
1066
+ files = [file for glob in globs for file in path.glob(glob)]
1067
+ if not files:
1068
+ raise Exception(f"Can't find model in directory {path}")
1069
+ if len(files) > 1:
1070
+ raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
1071
+ path = files[0]
1072
+
1073
+ paths = find_multifile_paths(path)
1074
+ models_plus: list[ModelPlus] = []
1075
+ for path in paths:
1076
+ print(f"Loading model file {path}")
1077
+ models_plus.append(lazy_load_file(path))
1078
+
1079
+ model_plus = merge_multifile_models(models_plus)
1080
+ return model_plus
1081
+
1082
+
1083
+ def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
1084
+ # Be extra-friendly and accept either a file or a directory. Also, if it's
1085
+ # a directory, it might be the model directory, and tokenizer.model might
1086
+ # be in the parent of that.
1087
+ if path.is_dir():
1088
+ vocab_file = "tokenizer.model"
1089
+ if vocabtype == 'bpe':
1090
+ vocab_file = "vocab.json"
1091
+ path2 = path / vocab_file
1092
+ # Use `.parent` instead of /.. to handle the symlink case better.
1093
+ path3 = path.parent / vocab_file
1094
+ path4 = Path(os.path.abspath("./models")) / vocab_file
1095
+ if path2.exists():
1096
+ path = path2
1097
+ elif path3.exists():
1098
+ path = path3
1099
+ elif path4.exists():
1100
+ path = path4
1101
+ else:
1102
+ raise FileNotFoundError(
1103
+ f"Could not find {vocab_file} in {path} or its parent; "
1104
+ "if it's in another directory, pass the directory as --vocab-dir")
1105
+
1106
+ print(f"Loading vocab file '{path}', type '{vocabtype}'")
1107
+
1108
+ added_tokens_path = path.parent / "added_tokens.json"
1109
+ if vocabtype == "bpe":
1110
+ return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
1111
+ elif vocabtype == "spm":
1112
+ return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
1113
+ else:
1114
+ raise ValueError(f"Unsupported vocabulary type {vocabtype}")
1115
+
1116
+
1117
+ def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
1118
+ namestr = {
1119
+ GGMLFileType.AllF32: "f32",
1120
+ GGMLFileType.MostlyF16: "f16",
1121
+ GGMLFileType.MostlyQ8_0:"q8_0",
1122
+ }[file_type]
1123
+ ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
1124
+ if ret in model_paths:
1125
+ sys.stderr.write(
1126
+ f"Error: Default output path ({ret}) would overwrite the input. "
1127
+ "Please explicitly specify a path using --outfile.\n")
1128
+ sys.exit(1)
1129
+ return ret
1130
+
1131
+
1132
+ def do_dump_model(model_plus: ModelPlus) -> None:
1133
+ print(f"model_plus.paths = {model_plus.paths!r}")
1134
+ print(f"model_plus.format = {model_plus.format!r}")
1135
+ print(f"model_plus.vocab = {model_plus.vocab!r}")
1136
+ for name, lazy_tensor in model_plus.model.items():
1137
+ print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
1138
+
1139
+
1140
+ def main(amodel, outfile, outtype=None):
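+ # Convert the checkpoint at `amodel` (a HF/torch/safetensors model directory) into a
+ # single GGUF file written to `outfile`; `outtype` optionally forces "f32", "f16" or "q8_0".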
1141
+ output_choices = ["f32", "f16"]
1142
+ if np.uint32(1) == np.uint32(1).newbyteorder("<"):
1143
+ # We currently only support Q8_0 output on little endian systems.
1144
+ output_choices.append("q8_0")
1145
+
1146
+ model_plus = load_some_model(Path(amodel))
1147
+ if model_plus.vocab:
1148
+ vocab = model_plus.vocab
1149
+ else:
1150
+ vocab_dir = model_plus.paths[0].parent
1151
+ vocab = load_vocab(Path(vocab_dir), vocabtype="spm")
1152
+ print(vocab)
1153
+ endianess = gguf.GGUFEndian.LITTLE
1154
+
1155
+ params = Params.load(model_plus)
1156
+ if params.n_ctx == -1:
1157
+ raise ValueError("CTX is 1")
1158
+ special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = False, n_vocab = vocab.vocab_size)
1159
+ model = model_plus.model
1160
+ model = convert_model_names(model, params)
1161
+ ftype = pick_output_type(model, outtype)
1162
+ model = convert_to_output_type(model, ftype)
1163
+ outfile = outfile
1164
+ params.ftype = ftype
1165
+ print(f"Writing {outfile}, format {ftype}")
1166
+ OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = DEFAULT_CONCURRENCY, endianess=endianess)
1167
+ print(f"Wrote {outfile}")
1168
+
hfconv.py ADDED
@@ -0,0 +1,82 @@
1
+ # THIS SOFTWARE IS NOT OPEN SOURCED!!! REDISTRIBUTION PROHIBITED! SEE LICENSE FOR DETAILS.
2
+ from constants import *
3
+
4
+ from llama_cpp import llama_cpp
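+ # The numeric values below mirror llama.cpp's llama_ftype enum
+ # (e.g. LLAMA_FTYPE_MOSTLY_Q4_0 == 2, LLAMA_FTYPE_MOSTLY_Q6_K == 18).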
5
+ types = {
6
+ 'F32': 0,
7
+ 'F16': 1,
8
+ 'Q4_0': 2,
9
+ 'Q4_1': 3,
10
+ 'Q8_0': 7,
11
+ 'Q5_0': 8,
12
+ 'Q5_1': 9,
13
+ 'Q2_K': 10,
14
+ 'Q3_K_S': 11,
15
+ 'Q3_K_M': 12,
16
+ 'Q3_K_L': 13,
17
+ 'Q4_K_S': 14,
18
+ 'Q4_K_M': 15,
19
+ 'Q5_K_S': 16,
20
+ 'Q5_K_M': 17,
21
+ 'Q6_K': 18,
22
+ }
23
+ def calcftype(type):
24
+ return types[type.upper()]
25
+
26
+
27
+ import shutil
28
+ import tempfile
29
+ import os
30
+ from slugify import slugify
31
+ from huggingface_hub import CommitInfo, CommitOperationAdd, Discussion, HfApi, hf_hub_download, repo_exists
32
+ from huggingface_hub.file_download import repo_folder_name
33
+ from typing import Dict, List, Optional, Set, Tuple
34
+ from huggingface_hub import snapshot_download
35
+ from cscript import main
36
+
37
+ def convert_it(
38
+ model_id, token, folder
39
+ ):
40
+ with open("README_TEMPLATE.md", 'r') as f:
41
+ README = f.read().replace('<<MODEL_ID>>', model_id)
42
+ path = snapshot_download(
43
+ repo_id=model_id, token=token, cache_dir=folder
44
+ )
45
+ sf_name = "model-f16.gguf"
46
+ main(path, os.path.join(folder, "model-f16.gguf"))
47
+ operation = [
48
+ CommitOperationAdd(path_in_repo=sf_name, path_or_fileobj=os.path.join(folder, "model-f16.gguf")),
49
+ CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=README.encode()),
50
+ ]
51
+ print("Quantization Time!")
52
+ for type in types_to_quantize:
53
+ print(f"Quantizing {type}!")
54
+ llama_cpp.llama_model_quantize(os.path.join(folder, "model-f16.gguf").encode(), os.path.join(folder, f"model-{type.lower()}.gguf").encode(), llama_cpp.llama_model_quantize_params(0, calcftype(type), True))
55
+ print(f"Done Quantizing {type}!")
56
+ operation.append(CommitOperationAdd(path_in_repo=f"model-{type.lower()}.gguf", path_or_fileobj=os.path.join(folder, f"model-{type.lower()}.gguf")))
57
+ return operation
58
+
59
+
60
+ def convert(
61
+ api: "HfApi", model_id: str, revision: Optional[str] = None, force: bool = False
62
+ ) -> Tuple["CommitInfo", List[Tuple[str, "Exception"]]]:
63
+ repo_id = username + "/" + slugify(model_id.strip()) + "-GGUF"
64
+ with tempfile.TemporaryDirectory() as d:
65
+ # d = "~/test"
66
+ folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
67
+ os.makedirs(folder)
68
+ if repo_exists(repo_id, token=api.token):
69
+ raise ValueError("Already exists")
70
+ try:
71
+ ops = convert_it(model_id, api.token, d)
72
+ api.create_repo(repo_id)
73
+ api.create_commit(
74
+ repo_id=repo_id,
75
+ revision=revision,
76
+ operations=ops,
77
+ commit_message="Add GGUF version",
78
+ commit_description="Automated commit"
79
+ )
80
+ finally:
81
+ shutil.rmtree(folder)
82
+ return repo_id
models/ggml-vocab-aquila.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c53c3c516ac67c7ca12977b9690fdea3d2ef13bbaed6378f98191a13ef5ca00
3
+ size 4825676
models/ggml-vocab-baichuan.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f5b955697f3bd3108070b1d5936c7eb9fc542b81c6932e59abddec75bca1963
3
+ size 1340998
models/ggml-vocab-falcon.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffbc7c119de7e9aab8f4257d617e3fa55f942a9f9ca84139ef3f5b1ca53836a8
3
+ size 2547782
models/ggml-vocab-gpt-neox.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae593a7f9b8bb174ed4f5019e41530463e4dac7aa06e42dee8aa650d2bdac53d
3
+ size 1771431
models/ggml-vocab-llama.gguf ADDED
Binary file (724 kB).
models/ggml-vocab-mpt.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55c7df0d0443a24260ac6f8d3710f224fa38137cfaec6693413b913194d47cc5
3
+ size 1771406
models/ggml-vocab-refact.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38ffb84a4c1aba7dc7f84358827e252874edeb80050bb0358e2b34fca09741d3
3
+ size 1720666
models/ggml-vocab-stablelm-3b-4e1t.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d9bc7a5570cf02a9d9347afa2a5d3847a9a96e88309b9b41c929b871021a6dd
3
+ size 1768581
models/ggml-vocab-starcoder.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:621db3ccdfcc3e5ed687fbba6dec9c6b29cec9f3c48172435a704f6321689b66
3
+ size 1719281
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ torch
2
+ transformers
3
+ gradio
4
+ sentencepiece
5
+ gguf
6
+ numpy
7
+ python-slugify
8
+ llama-cpp-python