Delta-Vector committed on
Commit 992f746
1 Parent(s): 38d97c0

out with the old in with the new

Files changed (3):
  1. README.md +95 -74
  2. tokenizer.json +2 -2
  3. tokenizer_config.json +256 -0
README.md CHANGED

@@ -43,39 +43,32 @@ state of the art AI models and helping foster innovation for everyone.
 
  ### Usage
 
- Below we share some code snippets on how to get quickly started with running the model. First make sure to `pip install -U transformers`, then copy the snippet from the section that is relevant for your usecase.
-
+ Below we share some code snippets on how to get started quickly with running the model. First, install the Transformers library with:
+ ```sh
+ pip install -U transformers
+ ```
 
- #### Running the model on a single / multi GPU
+ Then, copy the snippet from the section that is relevant for your use case.
 
+ #### Running with the `pipeline` API
 
  ```python
- # pip install accelerate
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch
+ from transformers import pipeline
 
- tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
- model = AutoModelForCausalLM.from_pretrained(
-     "google/gemma-2-9b",
-     device_map="auto",
-     torch_dtype=torch.bfloat16
+ pipe = pipeline(
+     "text-generation",
+     model="google/gemma-2-9b",
+     device="cuda",  # replace with "mps" to run on a Mac device
  )
 
- input_text = "Write me a poem about Machine Learning."
- input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
- outputs = model.generate(**input_ids)
- print(tokenizer.decode(outputs[0]))
+ text = "Once upon a time,"
+ outputs = pipe(text, max_new_tokens=256)
+ response = outputs[0]["generated_text"]
+ print(response)
  ```
 
- <a name="precisions"></a>
- #### Running the model on a GPU using different precisions
-
- The native weights of this model were exported in `bfloat16` precision. You can use `float16`, which may be faster on certain hardware, indicating the `torch_dtype` when loading the model. For convenience, the `float16` revision of the repo contains a copy of the weights already converted to that precision.
-
- You can also use `float32` if you skip the dtype, but no precision increase will occur (model weights will just be upcasted to `float32`). See examples below.
-
- * _Using `torch.float16`_
+ #### Running the model on a single / multi GPU
 
  ```python
  # pip install accelerate
@@ -86,57 +79,31 @@ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
  model = AutoModelForCausalLM.from_pretrained(
      "google/gemma-2-9b",
      device_map="auto",
-     torch_dtype=torch.float16,
-     revision="float16",
  )
 
  input_text = "Write me a poem about Machine Learning."
  input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 
- outputs = model.generate(**input_ids)
+ outputs = model.generate(**input_ids, max_new_tokens=32)
  print(tokenizer.decode(outputs[0]))
  ```
 
- * _Using `torch.bfloat16`_
+ #### Running the model through a CLI
 
- ```python
- # pip install accelerate
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ The [local-gemma](https://github.com/huggingface/local-gemma) repository contains a lightweight wrapper around Transformers
+ for running Gemma 2 through a command line interface, or CLI. Follow the [installation instructions](https://github.com/huggingface/local-gemma#cli-usage)
+ to get started, then launch the CLI through the following command:
 
- tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
- model = AutoModelForCausalLM.from_pretrained(
-     "google/gemma-2-9b",
-     device_map="auto",
-     torch_dtype=torch.bfloat16)
-
- input_text = "Write me a poem about Machine Learning."
- input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
- outputs = model.generate(**input_ids)
- print(tokenizer.decode(outputs[0]))
- ```
-
- * _Upcasting to `torch.float32`_
-
- ```python
- # pip install accelerate
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
- model = AutoModelForCausalLM.from_pretrained(
-     "google/gemma-2-9b",
-     device_map="auto")
-
- input_text = "Write me a poem about Machine Learning."
- input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
- outputs = model.generate(**input_ids)
- print(tokenizer.decode(outputs[0]))
- ```
+ ```shell
+ local-gemma --model "google/gemma-2-9b" --prompt "What is the capital of Mexico?"
  ```
 
  #### Quantized Versions through `bitsandbytes`
 
- * _Using 8-bit precision (int8)_
+ <details>
+ <summary>
+ Using 8-bit precision (int8)
+ </summary>
 
  ```python
  # pip install bitsandbytes accelerate
@@ -147,16 +114,21 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
  tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
  model = AutoModelForCausalLM.from_pretrained(
      "google/gemma-2-9b",
-     quantization_config=quantization_config)
+     quantization_config=quantization_config,
+ )
 
  input_text = "Write me a poem about Machine Learning."
  input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 
- outputs = model.generate(**input_ids)
+ outputs = model.generate(**input_ids, max_new_tokens=32)
  print(tokenizer.decode(outputs[0]))
  ```
+ </details>
 
- * _Using 4-bit precision_
+ <details>
+ <summary>
+ Using 4-bit precision
+ </summary>
 
  ```python
  # pip install bitsandbytes accelerate
@@ -167,30 +139,79 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)
  tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
  model = AutoModelForCausalLM.from_pretrained(
      "google/gemma-2-9b",
-     quantization_config=quantization_config)
+     quantization_config=quantization_config,
+ )
 
  input_text = "Write me a poem about Machine Learning."
  input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 
- outputs = model.generate(**input_ids)
+ outputs = model.generate(**input_ids, max_new_tokens=32)
  print(tokenizer.decode(outputs[0]))
  ```
+ </details>
 
+ #### Advanced Usage
 
- #### Other optimizations
+ <details>
+ <summary>
+ Torch compile
+ </summary>
 
- * _Flash Attention 2_
+ [Torch compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) is a method for speeding up the
+ inference of PyTorch modules. The Gemma-2 model can be run up to 6x faster by leveraging torch compile.
 
- First make sure to install `flash-attn` in your environment `pip install flash-attn`
+ Note that two warm-up steps are required before the full inference speed is realised:
 
- ```diff
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     torch_dtype=torch.float16,
- +   attn_implementation="flash_attention_2"
- ).to(0)
+ ```python
+ import os
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ from transformers import AutoTokenizer, Gemma2ForCausalLM
+ from transformers.cache_utils import HybridCache
+ import torch
+
+ torch.set_float32_matmul_precision("high")
+
+ # load the model + tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+ model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b", torch_dtype=torch.bfloat16)
+ model.to("cuda")
+
+ # apply the torch compile transformation
+ model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+
+ # pre-process inputs
+ input_text = "The theory of special relativity states "
+ model_inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
+ prompt_length = model_inputs.input_ids.shape[1]
+
+ # set-up k/v cache
+ past_key_values = HybridCache(
+     config=model.config,
+     max_batch_size=1,
+     max_cache_len=model.config.max_position_embeddings,
+     device=model.device,
+     dtype=model.dtype
+ )
+
+ # enable passing kv cache to generate
+ model._supports_cache_class = True
+ model.generation_config.cache_implementation = None
+
+ # two warm-up steps
+ for idx in range(2):
+     outputs = model.generate(**model_inputs, past_key_values=past_key_values, do_sample=True, temperature=1.0, max_new_tokens=128)
+     past_key_values.reset()
+
+ # fast run
+ outputs = model.generate(**model_inputs, past_key_values=past_key_values, do_sample=True, temperature=1.0, max_new_tokens=128)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  ```
 
+ For more details, refer to the [Transformers documentation](https://huggingface.co/docs/transformers/main/en/llm_optims?static-kv=basic+usage%3A+generation_config).
+
+ </details>
+
  ### Inputs and outputs
 
  * **Input:** Text string, such as a question, a prompt, or a document to be
tokenizer.json CHANGED

@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7da53ca29fb16f6b2489482fc0bc6a394162cdab14d12764a1755ebc583fea79
- size 17518525
+ oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922
+ size 17525357
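
`tokenizer.json` is stored via Git LFS, so the change above only swaps the pointer's `oid` and `size`. As a minimal sketch (assuming a copy of the updated `tokenizer.json` has been downloaded locally; the path below is a placeholder), the new pointer can be checked against the actual file:

```python
import hashlib
import os

# Placeholder path to a locally downloaded copy of the updated tokenizer.json.
path = "tokenizer.json"

# Hash the raw file bytes and compare against the LFS pointer values above.
with open(path, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

print(digest == "3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922")
print(os.path.getsize(path) == 17525357)
```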
tokenizer_config.json CHANGED

@@ -1737,6 +1737,262 @@
  "rstrip": false,
  "single_word": false,
  "special": false
+ },
+ "255968": {
+ "content": "[toxicity=0]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255969": {
+ "content": "\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255970": {
+ "content": "\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255971": {
+ "content": "\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255972": {
+ "content": "\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255973": {
+ "content": "\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255974": {
+ "content": "\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255975": {
+ "content": "\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255976": {
+ "content": "\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255977": {
+ "content": "\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255978": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255979": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255980": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255981": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255982": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255983": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255984": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255985": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255986": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255987": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255988": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255989": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255990": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255991": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255992": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255993": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255994": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255995": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255996": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255997": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255998": {
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "255999": {
+ "content": "<unused99>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
  }
  },
  "additional_special_tokens": [