Added Quantisation Information #5
by lgcharpe - opened

README.md CHANGED
@@ -374,4 +374,73 @@ model = AutoModelForCausalLM.from_pretrained(
    load_in_8bit=True,
    torch_dtype=torch.bfloat16
)
```

_____

## Quantization

### Provided files

| Name | Quant method | Bits Per Weight | Size | Max RAM/VRAM required | Use case |
| ---- | ---- | ---- | ---- | ---- | ----- |
| [normistral-7b-warm-Q3_K_M.gguf](https://huggingface.co/norallm/normistral-7b-warm/blob/main/normistral-7b-warm-Q3_K_M.gguf) | Q3_K_M | 3.89 | 3.28 GB | 5.37 GB | very small, high quality loss |
| [normistral-7b-warm-Q4_K_M.gguf](https://huggingface.co/norallm/normistral-7b-warm/blob/main/normistral-7b-warm-Q4_K_M.gguf) | Q4_K_M | 4.83 | 4.07 GB | 6.16 GB | medium, balanced quality - recommended |
| [normistral-7b-warm-Q5_K_M.gguf](https://huggingface.co/norallm/normistral-7b-warm/blob/main/normistral-7b-warm-Q5_K_M.gguf) | Q5_K_M | 5.67 | 4.78 GB | 6.87 GB | large, very low quality loss - recommended |
| [normistral-7b-warm-Q6_K.gguf](https://huggingface.co/norallm/normistral-7b-warm/blob/main/normistral-7b-warm-Q6_K.gguf) | Q6_K | 6.56 | 5.54 GB | 7.63 GB | very large, extremely low quality loss |
| [normistral-7b-warm-Q8_0.gguf](https://huggingface.co/norallm/normistral-7b-warm/blob/main/normistral-7b-warm-Q8_0.gguf) | Q8_0 | 8.50 | 7.17 GB | 9.26 GB | very large, extremely low quality loss - not recommended |
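If you would rather fetch one of the files above yourself (instead of letting llama-cpp-python resolve it, as shown further below), a minimal sketch using the `huggingface_hub` package could look like this; the filename is taken from the Q4_K_M row of the table and can be swapped for any other quantization level:

```python
from huggingface_hub import hf_hub_download

# Download a single GGUF file from the repository into the local Hugging Face cache.
# The filename matches the Q4_K_M entry in the table above.
gguf_path = hf_hub_download(
    repo_id="norallm/normistral-7b-warm",
    filename="normistral-7b-warm-Q4_K_M.gguf",
)
print(gguf_path)  # local path of the downloaded file
```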
### How to run from Python code

You can use GGUF models from Python with, for example, the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library.

#### How to load this model in Python code, using llama-cpp-python

For full documentation, please see: [llama-cpp-python docs](https://llama-cpp-python.readthedocs.io/en/latest/).

#### First install the package

Run one of the following commands, according to your system:

```shell
# Base llama-cpp-python with no GPU acceleration
pip install llama-cpp-python
# With NVIDIA CUDA acceleration
CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
# Or with OpenBLAS acceleration
CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
# Or with CLBlast acceleration
CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
# Or with AMD ROCm GPU acceleration (Linux only)
CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
# Or with Metal GPU acceleration (macOS only)
CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python

# On Windows, set CMAKE_ARGS in PowerShell like this (e.g. for NVIDIA CUDA):
$env:CMAKE_ARGS = "-DLLAMA_CUBLAS=on"
pip install llama-cpp-python
```
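As a quick sanity check after installation (a small sketch, not part of the original instructions), you can confirm that the binding imports correctly and print its version:

```python
# Verify the installation: if this import fails or the version looks wrong,
# reinstall with the appropriate CMAKE_ARGS from the commands above.
import llama_cpp

print(llama_cpp.__version__)
```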
#### Simple llama-cpp-python example code

```python
from llama_cpp import Llama

# Load the model directly from huggingface-hub (requires the huggingface-hub package to be installed).
# Set n_gpu_layers to the number of layers to offload to the GPU, or to 0 if no GPU acceleration is available on your system.
llm = Llama.from_pretrained(
    repo_id="norallm/normistral-7b-warm",  # HuggingFace repository containing the GGUF files
    filename="*Q4_K_M.gguf",  # suffix of the filename indicating the quantization level
    n_ctx=32768,  # the maximum sequence length to use - note that longer sequence lengths require much more resources
    n_threads=8,  # the number of CPU threads to use, tailor to your system and the resulting performance
    n_gpu_layers=35  # the number of layers to offload to the GPU, if you have GPU acceleration available
)

# Simple inference example
output = llm(
    "Engelsk: Hello everyone! I'm a language model, how are you doing today?\nBokmål:",  # prompt
    max_tokens=512,  # generate up to 512 tokens
    stop=["</s>"],  # example stop token
    echo=True,  # whether to echo the prompt in the output
    temperature=0.3  # for Q3_K_M, Q4_K_M, Q5_K_M and Q6_K it is recommended to keep the temperature relatively low
)
```
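As a follow-up sketch (not part of the original example): if you have already downloaded a GGUF file to disk, you can pass its local path to `Llama` directly instead of using `from_pretrained`, and stream tokens as they are generated:

```python
from llama_cpp import Llama

# Load a previously downloaded GGUF file from a local path
# (here the Q4_K_M file is assumed to sit in the working directory).
llm = Llama(
    model_path="normistral-7b-warm-Q4_K_M.gguf",
    n_ctx=32768,
    n_threads=8,
    n_gpu_layers=35
)

# Stream the completion token by token instead of waiting for the full output.
for chunk in llm(
    "Engelsk: Hello everyone! I'm a language model, how are you doing today?\nBokmål:",
    max_tokens=512,
    stop=["</s>"],
    temperature=0.3,
    stream=True
):
    print(chunk["choices"][0]["text"], end="", flush=True)
```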