jploski committed
Commit 7661a5f
1 Parent(s): 7a70a03

Uploaded new model version trained from scratch using syncdoth/RetNet commit 40fd7585 (2023-11-03)

Files changed (6):
  1. README.md +10 -8
  2. config.json +24 -14
  3. generation_config.json +6 -0
  4. model.safetensors +2 -2
  5. tokenizer.json +1 -6
  6. tokenizer_config.json +116 -0
README.md CHANGED
@@ -29,6 +29,8 @@ https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/
 
 ## Training procedure
 
+Note: updated on 2023-11-10 to work with the current version of syncdoth/RetNet.
+
 Just used the single tinyshakespeare text file as both the training and validation set (split up into paragraphs). See:
 
 https://colab.research.google.com/drive/1wZnM7FCe4TsQpoamJ7NDAuQfA3DYiwHi?usp=sharing
@@ -51,15 +53,15 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 5.3901        | 9.93  | 370  | 4.1523          |
-| 3.8122        | 19.87 | 740  | 3.3425          |
-| 3.1609        | 29.8  | 1110 | 2.8916          |
-| 2.8352        | 39.73 | 1480 | 2.7718          |
+| 3.6853        | 10.0  | 370  | 3.4459          |
+| 2.1973        | 20.0  | 740  | 2.0213          |
+| 1.3819        | 30.0  | 1110 | 1.3017          |
+| 1.1658        | 40.0  | 1480 | 1.1566          |
 
 
 ### Framework versions
 
-- Transformers 4.31.0
-- Pytorch 2.0.1+cu118
-- Datasets 2.14.3
-- Tokenizers 0.13.3
+- Transformers 4.35.0
+- Pytorch 2.1.0+cu118
+- Datasets 2.14.6
+- Tokenizers 0.14.1
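
The data preparation the README describes is simple enough to show in a few lines. This is a minimal sketch, not the actual notebook code (see the Colab link above for that); the local file name is an illustrative assumption.

```python
# Minimal sketch of the data preparation described in the README: one
# tinyshakespeare file, split on blank lines into paragraphs, used as both
# the training and the validation set. The file name is an assumption.
from datasets import Dataset

with open("tinyshakespeare.txt", encoding="utf-8") as f:
    text = f.read()

paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

train_ds = Dataset.from_dict({"text": paragraphs})
eval_ds = train_ds  # per the README: the same data serves as the validation set
```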
config.json CHANGED
@@ -1,28 +1,38 @@
 {
+  "activation_dropout": 0.0,
+  "activation_fn": "swish",
   "architectures": [
-    "RetNetModelWithLMHead"
+    "RetNetForCausalLM"
   ],
-  "chunk_size": 512,
+  "decoder_embed_dim": 128,
+  "decoder_ffn_embed_dim": 256,
+  "decoder_layers": 8,
+  "decoder_normalize_before": true,
+  "decoder_retention_heads": 4,
+  "decoder_value_embed_dim": 256,
+  "deepnorm": false,
+  "drop_path_rate": 0.0,
+  "dropout": 0.0,
   "eos_token_id": 11,
-  "ffn_proj_size": 256,
   "forward_impl": "parallel",
-  "hidden_size": 128,
   "initializer_range": 0.02,
   "is_decoder": true,
+  "layernorm_embedding": true,
+  "layernorm_eps": 1e-06,
   "model_type": "retnet",
-  "num_heads": 4,
-  "num_layers": 8,
+  "no_scale_embedding": true,
   "output_retentions": false,
   "pad_token_id": 11,
-  "qk_dim": 128,
+  "recurrent_chunk_size": 512,
+  "subln": true,
+  "tie_word_embeddings": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.31.0",
+  "transformers_version": "4.35.0",
   "unk_token_id": 11,
-  "use_bias_in_mlp": true,
-  "use_bias_in_msr": false,
-  "use_bias_in_msr_out": false,
   "use_cache": true,
-  "use_default_gamma": false,
-  "v_dim": 256,
-  "vocab_size": 65024
+  "use_ffn_rms_norm": false,
+  "use_glu": true,
+  "use_lm_decay": false,
+  "vocab_size": 65024,
+  "z_loss_coeff": 0.0
 }
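
The renamed hyperparameters line up with the old ones by value (128, 256, 8, 4, 256, 512). The sketch below infers an old-key to new-key mapping from that correspondence; it is an assumption drawn from this diff, not an official migration table from syncdoth/RetNet.

```python
# Inferred old-key -> new-key mapping (an assumption based on matching
# values in the diff above, not documented by syncdoth/RetNet):
KEY_MAP = {
    "hidden_size": "decoder_embed_dim",        # 128
    "ffn_proj_size": "decoder_ffn_embed_dim",  # 256
    "num_layers": "decoder_layers",            # 8
    "num_heads": "decoder_retention_heads",    # 4
    "v_dim": "decoder_value_embed_dim",        # 256
    "chunk_size": "recurrent_chunk_size",      # 512
}

def migrate(old_config: dict) -> dict:
    """Rename legacy keys; anything without a mapping passes through unchanged."""
    return {KEY_MAP.get(k, k): v for k, v in old_config.items()}
```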
generation_config.json ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "eos_token_id": 11,
+  "pad_token_id": 11,
+  "transformers_version": "4.35.0"
+}
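
With eos_token_id and pad_token_id (both 11) now persisted in generation_config.json, generate() can stop and pad without manual overrides. A minimal loading-and-sampling sketch; the repo id is a placeholder, and trust_remote_code is assumed to be required for the custom RetNet classes.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "user/retnet-shakespeare"  # placeholder, not the actual repo id
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)

# eos_token_id/pad_token_id are picked up from generation_config.json,
# so no explicit overrides are needed here.
inputs = tokenizer("ROMEO:", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=100, do_sample=True)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```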
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:946419d6dfb43d664380c8e8647a69f8500247ab5a631c6443aba08f08ce0aa6
3
- size 39624432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff8967abfad37523aff5bd998187886e89f0ed86a89067f7a05e29b2902eac9a
3
+ size 73943144
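
The weight file nearly doubles in size. A plausible but unverified explanation is the new "tie_word_embeddings": false in config.json, which stores a separate LM head instead of reusing the input embedding; the arithmetic roughly fits.

```python
# Back-of-the-envelope check (assumption: the growth comes mainly from the
# untied LM head implied by "tie_word_embeddings": false):
vocab, dim, f32_bytes = 65024, 128, 4
extra_head = vocab * dim * f32_bytes    # 33,292,288 bytes
observed = 73_943_144 - 39_624_432      # 34,318,712 bytes
print(extra_head, observed)             # same order of magnitude, close match
```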
tokenizer.json CHANGED
@@ -1,11 +1,6 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 64,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
+  "truncation": null,
   "padding": null,
   "added_tokens": [
     {
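
Dropping the baked-in truncation block means the tokenizer no longer silently cuts every input to 64 tokens; truncation becomes an explicit, per-call choice. A sketch continuing from the loading example above:

```python
# Truncation is now opt-in per call instead of hard-wired in tokenizer.json.
long_text = "\n".join(["All the world's a stage."] * 500)
enc = tokenizer(
    long_text,
    truncation=True,   # explicit choice by the caller
    max_length=2048,   # matches model_max_length in tokenizer_config.json
    return_tensors="pt",
)
```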
tokenizer_config.json CHANGED
@@ -1,7 +1,123 @@
 {
   "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": ">>TITLE<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": ">>ABSTRACT<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": ">>INTRODUCTION<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": ">>SUMMARY<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": ">>COMMENT<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": ">>ANSWER<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": ">>QUESTION<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": ">>DOMAIN<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": ">>PREFIX<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": ">>SUFFIX<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": ">>MIDDLE<<",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    ">>TITLE<<",
+    ">>ABSTRACT<<",
+    ">>INTRODUCTION<<",
+    ">>SUMMARY<<",
+    ">>COMMENT<<",
+    ">>ANSWER<<",
+    ">>QUESTION<<",
+    ">>DOMAIN<<",
+    ">>PREFIX<<",
+    ">>SUFFIX<<",
+    ">>MIDDLE<<"
+  ],
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
   "model_max_length": 2048,
+  "pad_token": "<|endoftext|>",
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
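
The newly declared "pad_token" (reusing <|endoftext|>, id 11) is what lets batched encoding work out of the box; without it, padding fails with a missing-pad-token error. Again continuing the loading sketch:

```python
# Batch encoding now works because pad_token is defined in tokenizer_config.json.
batch = tokenizer(
    ["To be, or not to be,", "ROMEO:"],
    padding=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape)  # the shorter row is padded with <|endoftext|>
```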