Initial Upload

Browse files

Files changed (9) hide show

README.md +84 -3
config.json +51 -0
generation_config.json +7 -0
model.safetensors +3 -0
quantization_config.json +24 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +43 -0

README.md CHANGED Viewed

@@ -1,3 +1,84 @@
----
-license: apache-2.0
----

+---
+language:
+- it
+- en
+tags:
+- pretrained
+- pytorch
+- causal-lm
+- minerva
+- autoround
+- intel-autoround
+- woq
+- gptq
+- intel
+license: apache-2.0
+model_name: Minerva 350M base v1.0
+base_model:
+- sapienzanlp/Minerva-350M-base-v1.0
+inference: false
+model_creator: sapienzanlp
+datasets:
+- uonlp/CulturaX
+pipeline_tag: text-generation
+prompt_template: '{prompt}
+  '
+quantized_by: fbaldassarri
+---
+## Model Information
+Quantized version of [sapienzanlp/Minerva-350M-base-v1.0](https://huggingface.co/sapienzanlp/Minerva-350M-base-v1.0) using torch.float32 for quantization tuning.
+- 4 bits (INT4)
+- group size = 128
+- Asymmetrical Quantization
+- Method WoQ (AutoRound format)
+Fast and low memory, 2-3X speedup (slight accuracy drop at W4G128)
+Quantization framework: [Intel AutoRound](https://github.com/intel/auto-round) v0.4.3
+Note: this INT4 version of Minerva-350M-base-v1.0 has been quantized to run inference through CPU.
+## Replication Recipe
+### Step 1 Install Requirements
+I suggest to install requirements into a dedicated python-virtualenv or a conda enviroment.
+```
+wget https://github.com/intel/auto-round/archive/refs/tags/v0.4.3.tar.gz
+tar -xvzf v0.4.3.tar.gz
+cd auto-round-0.4.3
+pip install -r requirements-cpu.txt --upgrade
+```
+### Step 2 Build Intel AutoRound wheel from sources
+```
+pip install -vvv --no-build-isolation -e .[cpu]
+```
+### Step 3 Script for Quantization
+```
+  from transformers import AutoModelForCausalLM, AutoTokenizer
+  model_name = "sapienzanlp/Minerva-350M-base-v1.0"
+  model = AutoModelForCausalLM.from_pretrained(model_name)
+  tokenizer = AutoTokenizer.from_pretrained(model_name)
+  from auto_round import AutoRound
+  bits, group_size, sym, device, amp = 4, 128, False, 'cpu', False
+  autoround = AutoRound(model, tokenizer, nsamples=128, iters=200, seqlen=512, batch_size=4, bits=bits, group_size=group_size, sym=sym, device=device, amp=amp)
+  autoround.quantize()
+  output_dir = "./AutoRound/sapienzanlp_Minerva-350M-base-v1.0-autoround-int4-gs128-asym"
+  autoround.save_quantized(output_dir, format='auto_round', inplace=True)
+```
+## License
+[Apache 2.0 License](https://choosealicense.com/licenses/apache-2.0/)
+## Disclaimer
+This quantized model comes with no warranty. It has been developed only for research purposes.

config.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "_name_or_path": "sapienzanlp/Minerva-350M-base-v1.0",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 72,
+  "hidden_act": "silu",
+  "hidden_size": 1152,
+  "initializer_range": 0.02,
+  "intermediate_size": 4032,
+  "max_position_embeddings": 16384,
+  "model_type": "mistral",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 4,
+  "quantization_config": {
+    "amp": false,
+    "autoround_version": "0.4.3",
+    "backend": "auto_round:exllamav2",
+    "batch_size": 4,
+    "bits": 4,
+    "data_type": "int",
+    "dataset": "NeelNanda/pile-10k",
+    "enable_minmax_tuning": true,
+    "enable_norm_bias_tuning": false,
+    "enable_quanted_input": true,
+    "gradient_accumulate_steps": 1,
+    "group_size": 128,
+    "iters": 200,
+    "low_gpu_mem_usage": false,
+    "lr": 0.005,
+    "minmax_lr": 0.005,
+    "nsamples": 128,
+    "quant_method": "intel/auto-round",
+    "scale_dtype": "torch.float16",
+    "seqlen": 512,
+    "sym": false,
+    "to_quant_block_names": null
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "sliding_window": 2048,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.1",
+  "use_cache": false,
+  "vocab_size": 32768
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.47.1",
+  "use_cache": false
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e4880b385a129e25e8d0ae32e360e1a55673f7fa1358430b3a760f4c615df05
+size 446328528

quantization_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bits": 4,
+  "group_size": 128,
+  "sym": false,
+  "data_type": "int",
+  "enable_quanted_input": true,
+  "enable_minmax_tuning": true,
+  "seqlen": 512,
+  "batch_size": 4,
+  "scale_dtype": "torch.float16",
+  "lr": 0.005,
+  "minmax_lr": 0.005,
+  "gradient_accumulate_steps": 1,
+  "iters": 200,
+  "amp": false,
+  "nsamples": 128,
+  "low_gpu_mem_usage": false,
+  "to_quant_block_names": null,
+  "enable_norm_bias_tuning": false,
+  "dataset": "NeelNanda/pile-10k",
+  "autoround_version": "0.4.3",
+  "quant_method": "intel/auto-round",
+  "backend": "auto_round:exllamav2"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb411d9f702041597b9c84f0c99ee6601f02631467e2a76722db3ca1a2a77bf5
+size 795167

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": true
+}