Merge branch 'main' of https://huggingface.co/ybelkada/flan-t5-large into main
- README.md +6 -11
- config.json +0 -29
README.md
CHANGED
@@ -62,9 +62,7 @@ language:
 - no
 
 tags:
--
-- translation
-- text-generation
+- text2text-generation
 
 datasets:
 - svakulenk0/qrecc
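The new `text2text-generation` tag matches the name of the Transformers pipeline task used for encoder-decoder models such as T5. A minimal sketch of that pipeline call follows; the checkpoint id is taken from the usage snippets later in this card and the prompt is only illustrative:

```python
# Minimal sketch of the pipeline task named by the new tag; the prompt is illustrative.
from transformers import pipeline

generator = pipeline("text2text-generation", model="google/flan-t5-large")
print(generator("translate English to German: How old are you?"))
```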
@@ -101,7 +99,7 @@ license: apache-2.0
 
 # TL;DR
 
-If you already know T5, FLAN-T5 is just better at everything. For the same number of parameters, these models have been fine-tuned on more than 1000 additional tasks covering also more languages.
+If you already know T5, FLAN-T5 is just better at everything. For the same number of parameters, these models have been fine-tuned on more than 1000 additional tasks covering also more languages.
 As mentioned in the first few lines of the abstract :
 > Flan-PaLM 540B achieves state-of-the-art performance on several benchmarks, such as 75.2% on five-shot MMLU. We also publicly release Flan-T5 checkpoints,1 which achieve strong few-shot performance even compared to much larger models, such as PaLM 62B. Overall, instruction finetuning is a general method for improving the performance and usability of pretrained language models.
 
@@ -155,7 +153,7 @@ print(tokenizer.decode(outputs[0]))
 <summary> Click to expand </summary>
 
 ```python
-
+# pip install accelerate
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 
 tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
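The added `# pip install accelerate` comment suggests the surrounding GPU snippet loads the model with `device_map="auto"`, which relies on Accelerate for weight placement. The rest of the snippet is not visible in this hunk, so the sketch below fills it in under that assumption, with an illustrative prompt:

```python
# pip install accelerate
# Hedged sketch of the GPU example this hunk appears to belong to;
# device_map="auto" is an assumption and requires Accelerate.
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto")

input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))
```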
@@ -178,6 +176,7 @@ print(tokenizer.decode(outputs[0]))
 <summary> Click to expand </summary>
 
 ```python
+# pip install accelerate
 import torch
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 
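The `import torch` context line together with the same `accelerate` install comment points to a reduced-precision variant of the GPU example. A sketch assuming FP16 loading via `torch_dtype=torch.float16` and `device_map="auto"` (neither is shown in the hunk itself):

```python
# pip install accelerate
# Hedged sketch of an FP16 variant; torch_dtype and device_map are assumptions.
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained(
    "google/flan-t5-large", device_map="auto", torch_dtype=torch.float16
)

input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))
```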
@@ -199,7 +198,7 @@ print(tokenizer.decode(outputs[0]))
 <summary> Click to expand </summary>
 
 ```python
-# pip install bitsandbytes
+# pip install bitsandbytes accelerate
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 
 tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
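The install comment now names both `bitsandbytes` and `accelerate`, which is consistent with 8-bit loading through `load_in_8bit=True` (it routes through both libraries). A sketch under that assumption, with an illustrative prompt:

```python
# pip install bitsandbytes accelerate
# Hedged sketch of an INT8 variant; load_in_8bit=True and device_map are assumptions.
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained(
    "google/flan-t5-large", device_map="auto", load_in_8bit=True
)

input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))
```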
@@ -308,8 +307,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
 
 copyright = {Creative Commons Attribution 4.0 International}
 }
-```
-
-# Model Card Authors
-
-This model card was written by the team at Hugging Face.
+```
config.json
CHANGED
@@ -23,35 +23,6 @@
   "pad_token_id": 0,
   "relative_attention_max_distance": 128,
   "relative_attention_num_buckets": 32,
-  "task_specific_params": {
-    "summarization": {
-      "early_stopping": true,
-      "length_penalty": 2.0,
-      "max_length": 200,
-      "min_length": 30,
-      "no_repeat_ngram_size": 3,
-      "num_beams": 4,
-      "prefix": "summarize: "
-    },
-    "translation_en_to_de": {
-      "early_stopping": true,
-      "max_length": 300,
-      "num_beams": 4,
-      "prefix": "translate English to German: "
-    },
-    "translation_en_to_fr": {
-      "early_stopping": true,
-      "max_length": 300,
-      "num_beams": 4,
-      "prefix": "translate English to French: "
-    },
-    "translation_en_to_ro": {
-      "early_stopping": true,
-      "max_length": 300,
-      "num_beams": 4,
-      "prefix": "translate English to Romanian: "
-    }
-  },
   "tie_word_embeddings": false,
   "transformers_version": "4.23.1",
   "use_cache": true,
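Dropping `task_specific_params` removes the legacy per-task generation defaults (beam search settings, length limits, and the `summarize:` / `translate ...:` prompt prefixes) that pipelines could previously inherit from the config. Callers who relied on them can pass the prefix and generation arguments explicitly instead; a hedged sketch using the values from the removed `summarization` block, with an illustrative input sentence and the checkpoint id taken from the card's usage snippets:

```python
# Sketch: reproducing the removed "summarization" defaults explicitly at generation time.
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

# Prefix and generation settings copied from the removed config block.
text = "summarize: Studies have shown that owning a dog is good for you."
input_ids = tokenizer(text, return_tensors="pt").input_ids

outputs = model.generate(
    input_ids,
    num_beams=4,
    early_stopping=True,
    length_penalty=2.0,
    max_length=200,
    min_length=30,
    no_repeat_ngram_size=3,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```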
|