3v324v23 commited on
Commit
f4bdf0f
1 Parent(s): ec62f1c

add models

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +10 -0
  2. README.md +3 -3
  3. baseline_bs4_top1.log +0 -0
  4. baseline_bs4_top1/checkpoint-10000/config.json +62 -0
  5. baseline_bs4_top1/checkpoint-10000/generation_config.json +7 -0
  6. baseline_bs4_top1/checkpoint-10000/optimizer.pt +3 -0
  7. baseline_bs4_top1/checkpoint-10000/pytorch_model.bin +3 -0
  8. baseline_bs4_top1/checkpoint-10000/rng_state.pth +3 -0
  9. baseline_bs4_top1/checkpoint-10000/scheduler.pt +3 -0
  10. baseline_bs4_top1/checkpoint-10000/special_tokens_map.json +107 -0
  11. baseline_bs4_top1/checkpoint-10000/spiece.model +3 -0
  12. baseline_bs4_top1/checkpoint-10000/tokenizer.json +0 -0
  13. baseline_bs4_top1/checkpoint-10000/tokenizer_config.json +112 -0
  14. baseline_bs4_top1/checkpoint-10000/trainer_state.json +139 -0
  15. baseline_bs4_top1/checkpoint-10000/training_args.bin +3 -0
  16. baseline_bs4_top1/checkpoint-20000/config.json +62 -0
  17. baseline_bs4_top1/checkpoint-20000/generation_config.json +7 -0
  18. baseline_bs4_top1/checkpoint-20000/optimizer.pt +3 -0
  19. baseline_bs4_top1/checkpoint-20000/pytorch_model.bin +3 -0
  20. baseline_bs4_top1/checkpoint-20000/rng_state.pth +3 -0
  21. baseline_bs4_top1/checkpoint-20000/scheduler.pt +3 -0
  22. baseline_bs4_top1/checkpoint-20000/special_tokens_map.json +107 -0
  23. baseline_bs4_top1/checkpoint-20000/spiece.model +3 -0
  24. baseline_bs4_top1/checkpoint-20000/tokenizer.json +0 -0
  25. baseline_bs4_top1/checkpoint-20000/tokenizer_config.json +112 -0
  26. baseline_bs4_top1/checkpoint-20000/trainer_state.json +259 -0
  27. baseline_bs4_top1/checkpoint-20000/training_args.bin +3 -0
  28. baseline_bs4_top1/data_config.json +1 -0
  29. baseline_bs4_top1/hfmodel_config.json +1 -0
  30. baseline_bs4_top1/model_config.json +1 -0
  31. baseline_bs4_top1/train_config.json +1 -0
  32. baseline_bs4_top2.log +0 -0
  33. baseline_bs4_top2/checkpoint-10000/config.json +62 -0
  34. baseline_bs4_top2/checkpoint-10000/generation_config.json +7 -0
  35. baseline_bs4_top2/checkpoint-10000/optimizer.pt +3 -0
  36. baseline_bs4_top2/checkpoint-10000/pytorch_model.bin +3 -0
  37. baseline_bs4_top2/checkpoint-10000/rng_state.pth +3 -0
  38. baseline_bs4_top2/checkpoint-10000/scheduler.pt +3 -0
  39. baseline_bs4_top2/checkpoint-10000/special_tokens_map.json +107 -0
  40. baseline_bs4_top2/checkpoint-10000/spiece.model +3 -0
  41. baseline_bs4_top2/checkpoint-10000/tokenizer.json +0 -0
  42. baseline_bs4_top2/checkpoint-10000/tokenizer_config.json +112 -0
  43. baseline_bs4_top2/checkpoint-10000/trainer_state.json +139 -0
  44. baseline_bs4_top2/checkpoint-10000/training_args.bin +3 -0
  45. baseline_bs4_top2/checkpoint-20000/config.json +62 -0
  46. baseline_bs4_top2/checkpoint-20000/generation_config.json +7 -0
  47. baseline_bs4_top2/checkpoint-20000/optimizer.pt +3 -0
  48. baseline_bs4_top2/checkpoint-20000/pytorch_model.bin +3 -0
  49. baseline_bs4_top2/checkpoint-20000/rng_state.pth +3 -0
  50. baseline_bs4_top2/checkpoint-20000/scheduler.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ baseline_bs4_top4/checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
37
+ calibrate_margin_ibn_dd/checkpoint-20000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
38
+ calibrate_rank_ibn_dd/checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
39
+ baseline_bs4_top1/checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
40
+ baseline_bs4_top1/checkpoint-20000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
41
+ baseline_bs4_top2/checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
42
+ baseline_bs4_top2/checkpoint-20000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
43
+ baseline_bs4_top4/checkpoint-20000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
44
+ calibrate_margin_ibn_dd/checkpoint-10000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
45
+ calibrate_rank_ibn_dd/checkpoint-20000/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,3 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
baseline_bs4_top1.log ADDED
The diff for this file is too large to render. See raw diff
 
baseline_bs4_top1/checkpoint-10000/config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/flan-t5-base",
3
+ "architectures": [
4
+ "SoftRelPromptFlanT5"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2048,
8
+ "d_kv": 64,
9
+ "d_model": 768,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 12,
22
+ "num_heads": 12,
23
+ "num_layers": 12,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_max_distance": 128,
27
+ "relative_attention_num_buckets": 32,
28
+ "task_specific_params": {
29
+ "summarization": {
30
+ "early_stopping": true,
31
+ "length_penalty": 2.0,
32
+ "max_length": 200,
33
+ "min_length": 30,
34
+ "no_repeat_ngram_size": 3,
35
+ "num_beams": 4,
36
+ "prefix": "summarize: "
37
+ },
38
+ "translation_en_to_de": {
39
+ "early_stopping": true,
40
+ "max_length": 300,
41
+ "num_beams": 4,
42
+ "prefix": "translate English to German: "
43
+ },
44
+ "translation_en_to_fr": {
45
+ "early_stopping": true,
46
+ "max_length": 300,
47
+ "num_beams": 4,
48
+ "prefix": "translate English to French: "
49
+ },
50
+ "translation_en_to_ro": {
51
+ "early_stopping": true,
52
+ "max_length": 300,
53
+ "num_beams": 4,
54
+ "prefix": "translate English to Romanian: "
55
+ }
56
+ },
57
+ "tie_word_embeddings": false,
58
+ "torch_dtype": "float32",
59
+ "transformers_version": "4.33.1",
60
+ "use_cache": true,
61
+ "vocab_size": 32128
62
+ }
baseline_bs4_top1/checkpoint-10000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.33.1"
7
+ }
baseline_bs4_top1/checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09c0ceab0ed46acb0ec16d626a5fcc26798ea2d7a8eddcbfe151546635d969fb
3
+ size 144545
baseline_bs4_top1/checkpoint-10000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e12a403413d9634b6e7804e3e2e1979f6566c15d89f196db0bc292bd6885c61
3
+ size 990480513
baseline_bs4_top1/checkpoint-10000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de4c87fc2dfbf5a627f8c2a0575b0effa1f233623d0165ebcd993a60952af24b
3
+ size 14575
baseline_bs4_top1/checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89744af0d534dd9add5a42ebd997c43178aeb78f0f65e79af8379d8a5c11b73a
3
+ size 627
baseline_bs4_top1/checkpoint-10000/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": "</s>",
105
+ "pad_token": "<pad>",
106
+ "unk_token": "<unk>"
107
+ }
baseline_bs4_top1/checkpoint-10000/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
baseline_bs4_top1/checkpoint-10000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
baseline_bs4_top1/checkpoint-10000/tokenizer_config.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "clean_up_tokenization_spaces": true,
105
+ "eos_token": "</s>",
106
+ "extra_ids": 100,
107
+ "model_max_length": 512,
108
+ "pad_token": "<pad>",
109
+ "sp_model_kwargs": {},
110
+ "tokenizer_class": "T5Tokenizer",
111
+ "unk_token": "<unk>"
112
+ }
baseline_bs4_top1/checkpoint-10000/trainer_state.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.008527584604298755,
5
+ "eval_steps": 500,
6
+ "global_step": 10000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 0.01,
14
+ "loss": 1.6476,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "learning_rate": 0.01,
20
+ "loss": 1.5631,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.0,
25
+ "learning_rate": 0.01,
26
+ "loss": 1.5854,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.0,
31
+ "learning_rate": 0.01,
32
+ "loss": 1.5557,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.0,
37
+ "learning_rate": 0.01,
38
+ "loss": 1.5653,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.0,
43
+ "learning_rate": 0.01,
44
+ "loss": 1.5457,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.0,
49
+ "learning_rate": 0.01,
50
+ "loss": 1.5552,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.0,
55
+ "learning_rate": 0.01,
56
+ "loss": 1.5559,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.0,
61
+ "learning_rate": 0.01,
62
+ "loss": 1.5465,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.0,
67
+ "learning_rate": 0.01,
68
+ "loss": 1.5481,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.0,
73
+ "learning_rate": 0.01,
74
+ "loss": 1.5311,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.01,
79
+ "learning_rate": 0.01,
80
+ "loss": 1.5356,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.01,
85
+ "learning_rate": 0.01,
86
+ "loss": 1.5502,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.01,
91
+ "learning_rate": 0.01,
92
+ "loss": 1.527,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 0.01,
97
+ "learning_rate": 0.01,
98
+ "loss": 1.5383,
99
+ "step": 7500
100
+ },
101
+ {
102
+ "epoch": 0.01,
103
+ "learning_rate": 0.01,
104
+ "loss": 1.5064,
105
+ "step": 8000
106
+ },
107
+ {
108
+ "epoch": 0.01,
109
+ "learning_rate": 0.01,
110
+ "loss": 1.5271,
111
+ "step": 8500
112
+ },
113
+ {
114
+ "epoch": 0.01,
115
+ "learning_rate": 0.01,
116
+ "loss": 1.5295,
117
+ "step": 9000
118
+ },
119
+ {
120
+ "epoch": 0.01,
121
+ "learning_rate": 0.01,
122
+ "loss": 1.5098,
123
+ "step": 9500
124
+ },
125
+ {
126
+ "epoch": 0.01,
127
+ "learning_rate": 0.01,
128
+ "loss": 1.53,
129
+ "step": 10000
130
+ }
131
+ ],
132
+ "logging_steps": 500,
133
+ "max_steps": 20000,
134
+ "num_train_epochs": 1,
135
+ "save_steps": 10000,
136
+ "total_flos": 4.829257277256499e+16,
137
+ "trial_name": null,
138
+ "trial_params": null
139
+ }
baseline_bs4_top1/checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09ab173dc6fc36e5747aa5255939206d9c965bbe4469338c8b96de7a0faed00a
3
+ size 4539
baseline_bs4_top1/checkpoint-20000/config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/flan-t5-base",
3
+ "architectures": [
4
+ "SoftRelPromptFlanT5"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2048,
8
+ "d_kv": 64,
9
+ "d_model": 768,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 12,
22
+ "num_heads": 12,
23
+ "num_layers": 12,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_max_distance": 128,
27
+ "relative_attention_num_buckets": 32,
28
+ "task_specific_params": {
29
+ "summarization": {
30
+ "early_stopping": true,
31
+ "length_penalty": 2.0,
32
+ "max_length": 200,
33
+ "min_length": 30,
34
+ "no_repeat_ngram_size": 3,
35
+ "num_beams": 4,
36
+ "prefix": "summarize: "
37
+ },
38
+ "translation_en_to_de": {
39
+ "early_stopping": true,
40
+ "max_length": 300,
41
+ "num_beams": 4,
42
+ "prefix": "translate English to German: "
43
+ },
44
+ "translation_en_to_fr": {
45
+ "early_stopping": true,
46
+ "max_length": 300,
47
+ "num_beams": 4,
48
+ "prefix": "translate English to French: "
49
+ },
50
+ "translation_en_to_ro": {
51
+ "early_stopping": true,
52
+ "max_length": 300,
53
+ "num_beams": 4,
54
+ "prefix": "translate English to Romanian: "
55
+ }
56
+ },
57
+ "tie_word_embeddings": false,
58
+ "torch_dtype": "float32",
59
+ "transformers_version": "4.33.1",
60
+ "use_cache": true,
61
+ "vocab_size": 32128
62
+ }
baseline_bs4_top1/checkpoint-20000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.33.1"
7
+ }
baseline_bs4_top1/checkpoint-20000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d89bdef1fa8581fbf366465c2c48a742068cc56363ee861230021037b25a7a53
3
+ size 144545
baseline_bs4_top1/checkpoint-20000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:979953b9f2b37eb669d4d0a5cdaf0a0fa69b4432d7bf17322cc56e064d696559
3
+ size 990480513
baseline_bs4_top1/checkpoint-20000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ee65fd173e43a5bb96f2f07bf1e86b7666cd24f1ff7c2f132f19e39ccc7b2b9
3
+ size 14575
baseline_bs4_top1/checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3ddfbcd33fc0d81c222807ca3e42cd9654f7e531f573941ed9599b1e07e0373
3
+ size 627
baseline_bs4_top1/checkpoint-20000/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": "</s>",
105
+ "pad_token": "<pad>",
106
+ "unk_token": "<unk>"
107
+ }
baseline_bs4_top1/checkpoint-20000/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
baseline_bs4_top1/checkpoint-20000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
baseline_bs4_top1/checkpoint-20000/tokenizer_config.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "clean_up_tokenization_spaces": true,
105
+ "eos_token": "</s>",
106
+ "extra_ids": 100,
107
+ "model_max_length": 512,
108
+ "pad_token": "<pad>",
109
+ "sp_model_kwargs": {},
110
+ "tokenizer_class": "T5Tokenizer",
111
+ "unk_token": "<unk>"
112
+ }
baseline_bs4_top1/checkpoint-20000/trainer_state.json ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.01705516920859751,
5
+ "eval_steps": 500,
6
+ "global_step": 20000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 0.01,
14
+ "loss": 1.6476,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "learning_rate": 0.01,
20
+ "loss": 1.5631,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.0,
25
+ "learning_rate": 0.01,
26
+ "loss": 1.5854,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.0,
31
+ "learning_rate": 0.01,
32
+ "loss": 1.5557,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.0,
37
+ "learning_rate": 0.01,
38
+ "loss": 1.5653,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.0,
43
+ "learning_rate": 0.01,
44
+ "loss": 1.5457,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.0,
49
+ "learning_rate": 0.01,
50
+ "loss": 1.5552,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.0,
55
+ "learning_rate": 0.01,
56
+ "loss": 1.5559,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.0,
61
+ "learning_rate": 0.01,
62
+ "loss": 1.5465,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.0,
67
+ "learning_rate": 0.01,
68
+ "loss": 1.5481,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.0,
73
+ "learning_rate": 0.01,
74
+ "loss": 1.5311,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.01,
79
+ "learning_rate": 0.01,
80
+ "loss": 1.5356,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.01,
85
+ "learning_rate": 0.01,
86
+ "loss": 1.5502,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.01,
91
+ "learning_rate": 0.01,
92
+ "loss": 1.527,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 0.01,
97
+ "learning_rate": 0.01,
98
+ "loss": 1.5383,
99
+ "step": 7500
100
+ },
101
+ {
102
+ "epoch": 0.01,
103
+ "learning_rate": 0.01,
104
+ "loss": 1.5064,
105
+ "step": 8000
106
+ },
107
+ {
108
+ "epoch": 0.01,
109
+ "learning_rate": 0.01,
110
+ "loss": 1.5271,
111
+ "step": 8500
112
+ },
113
+ {
114
+ "epoch": 0.01,
115
+ "learning_rate": 0.01,
116
+ "loss": 1.5295,
117
+ "step": 9000
118
+ },
119
+ {
120
+ "epoch": 0.01,
121
+ "learning_rate": 0.01,
122
+ "loss": 1.5098,
123
+ "step": 9500
124
+ },
125
+ {
126
+ "epoch": 0.01,
127
+ "learning_rate": 0.01,
128
+ "loss": 1.53,
129
+ "step": 10000
130
+ },
131
+ {
132
+ "epoch": 0.01,
133
+ "learning_rate": 0.01,
134
+ "loss": 1.5387,
135
+ "step": 10500
136
+ },
137
+ {
138
+ "epoch": 0.01,
139
+ "learning_rate": 0.01,
140
+ "loss": 1.5176,
141
+ "step": 11000
142
+ },
143
+ {
144
+ "epoch": 0.01,
145
+ "learning_rate": 0.01,
146
+ "loss": 1.5296,
147
+ "step": 11500
148
+ },
149
+ {
150
+ "epoch": 0.01,
151
+ "learning_rate": 0.01,
152
+ "loss": 1.5416,
153
+ "step": 12000
154
+ },
155
+ {
156
+ "epoch": 0.01,
157
+ "learning_rate": 0.01,
158
+ "loss": 1.514,
159
+ "step": 12500
160
+ },
161
+ {
162
+ "epoch": 0.01,
163
+ "learning_rate": 0.01,
164
+ "loss": 1.4975,
165
+ "step": 13000
166
+ },
167
+ {
168
+ "epoch": 0.01,
169
+ "learning_rate": 0.01,
170
+ "loss": 1.5488,
171
+ "step": 13500
172
+ },
173
+ {
174
+ "epoch": 0.01,
175
+ "learning_rate": 0.01,
176
+ "loss": 1.4987,
177
+ "step": 14000
178
+ },
179
+ {
180
+ "epoch": 0.01,
181
+ "learning_rate": 0.01,
182
+ "loss": 1.4859,
183
+ "step": 14500
184
+ },
185
+ {
186
+ "epoch": 0.01,
187
+ "learning_rate": 0.01,
188
+ "loss": 1.5495,
189
+ "step": 15000
190
+ },
191
+ {
192
+ "epoch": 0.01,
193
+ "learning_rate": 0.01,
194
+ "loss": 1.5347,
195
+ "step": 15500
196
+ },
197
+ {
198
+ "epoch": 0.01,
199
+ "learning_rate": 0.01,
200
+ "loss": 1.5225,
201
+ "step": 16000
202
+ },
203
+ {
204
+ "epoch": 0.01,
205
+ "learning_rate": 0.01,
206
+ "loss": 1.537,
207
+ "step": 16500
208
+ },
209
+ {
210
+ "epoch": 0.01,
211
+ "learning_rate": 0.01,
212
+ "loss": 1.512,
213
+ "step": 17000
214
+ },
215
+ {
216
+ "epoch": 0.01,
217
+ "learning_rate": 0.01,
218
+ "loss": 1.5263,
219
+ "step": 17500
220
+ },
221
+ {
222
+ "epoch": 0.02,
223
+ "learning_rate": 0.01,
224
+ "loss": 1.5188,
225
+ "step": 18000
226
+ },
227
+ {
228
+ "epoch": 0.02,
229
+ "learning_rate": 0.01,
230
+ "loss": 1.5163,
231
+ "step": 18500
232
+ },
233
+ {
234
+ "epoch": 0.02,
235
+ "learning_rate": 0.01,
236
+ "loss": 1.5372,
237
+ "step": 19000
238
+ },
239
+ {
240
+ "epoch": 0.02,
241
+ "learning_rate": 0.01,
242
+ "loss": 1.5225,
243
+ "step": 19500
244
+ },
245
+ {
246
+ "epoch": 0.02,
247
+ "learning_rate": 0.01,
248
+ "loss": 1.5114,
249
+ "step": 20000
250
+ }
251
+ ],
252
+ "logging_steps": 500,
253
+ "max_steps": 20000,
254
+ "num_train_epochs": 1,
255
+ "save_steps": 10000,
256
+ "total_flos": 9.652535293088563e+16,
257
+ "trial_name": null,
258
+ "trial_params": null
259
+ }
baseline_bs4_top1/checkpoint-20000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09ab173dc6fc36e5747aa5255939206d9c965bbe4469338c8b96de7a0faed00a
3
+ size 4539
baseline_bs4_top1/data_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset_config_name": null, "overwrite_cache": false, "preprocessing_num_workers": null, "train_file": "/home/jhju/datasets/nils.sentence.transformers/ce.minilm.hardneg.vL.jsonl", "eval_file": null, "max_p_length": 128, "max_q_length": 16, "m_negative_per_example": 4, "m_positive_per_example": 4, "random_corrupt_rate": 0.0}
baseline_bs4_top1/hfmodel_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name_or_path": "google/flan-t5-base", "config_name": "google/flan-t5-base", "tokenizer_name": "google/flan-t5-base", "cache_dir": null, "use_fast_tokenizer": true, "use_auth_token": false}
baseline_bs4_top1/model_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"add_classification_head": false, "baseline_prefix": "{1}", "instruction_prompt": "Generate a question for the passage with relevance label: ", "instruction_prompt_idx": [6939, 2206, 3, 9, 822, 21, 8, 5454, 28, 20208, 3783, 10, 3], "pos_neg_prompt": null, "pos_neg_prompt_idx": null, "relevant_prompt": "true true true true true", "relevant_prompt_idx": [1176, 1176, 1176, 1176, 1176], "irrelevant_prompt": "false false false false false", "irrelevant_prompt_idx": [6136, 6136, 6136, 6136, 6136], "head_size": 64, "pooling": "mean", "activation": "sigmoid", "latent_size": 128, "activate_prompt_attention": true}
baseline_bs4_top1/train_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"output_dir": "/work/jhju/readqg-baseline//baseline_bs4_top1", "overwrite_output_dir": true, "do_train": true, "do_eval": false, "do_predict": false, "evaluation_strategy": "no", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 4, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 1, "eval_accumulation_steps": null, "eval_delay": 0, "learning_rate": 0.01, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3.0, "max_steps": 20000, "lr_scheduler_type": "constant", "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "./logs", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 500, "logging_nan_inf_filter": true, "save_strategy": "steps", "save_steps": 10000, "save_total_limit": 5, "save_safetensors": false, "save_on_each_node": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 500, "dataloader_num_workers": 0, "past_index": -1, "run_name": "prompt=5_batch=4_sample=top1", "disable_tqdm": false, "remove_unused_columns": false, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "sharded_ddp": [], "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": {"min_num_params": 0, "xla": false, "xla_fsdp_grad_ckpt": false}, "fsdp_transformer_layer_cls_to_wrap": null, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": ["wandb"], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": null, "hub_private_repo": false, "hub_always_push": false, "gradient_checkpointing": true, "include_inputs_for_metrics": false, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": null, "_n_gpu": 1, "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "dispatch_batches": null, "sortish_sampler": false, "predict_with_generate": false, "generation_max_length": null, "generation_num_beams": null, "generation_config": null, "random_init": false, "enable_unlikelihood": false, "enable_calibration": null, "calibration_margin_ngrams": null, "gamma": 1.0, "enable_similarity_loss": null, "document_wise_contrastive": false, "relevance_wise_contrastive": false, "tau": 1.0, "sample_random": true, "sample_topk": 1, "enable_vae_loss": false, "k": 0.0025, "x0": 2500, "annealing_fn": "logistic"}
baseline_bs4_top2.log ADDED
The diff for this file is too large to render. See raw diff
 
baseline_bs4_top2/checkpoint-10000/config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/flan-t5-base",
3
+ "architectures": [
4
+ "SoftRelPromptFlanT5"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2048,
8
+ "d_kv": 64,
9
+ "d_model": 768,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 12,
22
+ "num_heads": 12,
23
+ "num_layers": 12,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_max_distance": 128,
27
+ "relative_attention_num_buckets": 32,
28
+ "task_specific_params": {
29
+ "summarization": {
30
+ "early_stopping": true,
31
+ "length_penalty": 2.0,
32
+ "max_length": 200,
33
+ "min_length": 30,
34
+ "no_repeat_ngram_size": 3,
35
+ "num_beams": 4,
36
+ "prefix": "summarize: "
37
+ },
38
+ "translation_en_to_de": {
39
+ "early_stopping": true,
40
+ "max_length": 300,
41
+ "num_beams": 4,
42
+ "prefix": "translate English to German: "
43
+ },
44
+ "translation_en_to_fr": {
45
+ "early_stopping": true,
46
+ "max_length": 300,
47
+ "num_beams": 4,
48
+ "prefix": "translate English to French: "
49
+ },
50
+ "translation_en_to_ro": {
51
+ "early_stopping": true,
52
+ "max_length": 300,
53
+ "num_beams": 4,
54
+ "prefix": "translate English to Romanian: "
55
+ }
56
+ },
57
+ "tie_word_embeddings": false,
58
+ "torch_dtype": "float32",
59
+ "transformers_version": "4.33.1",
60
+ "use_cache": true,
61
+ "vocab_size": 32128
62
+ }
baseline_bs4_top2/checkpoint-10000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.33.1"
7
+ }
baseline_bs4_top2/checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc67f5c5dc22f7c375b0c2101d705d270abdc690e52970bb6e6d499ca53cc6a7
3
+ size 144545
baseline_bs4_top2/checkpoint-10000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d366a12d59545d20daa0dbcde33cbac47216dd084289a717889298622bb0e2ae
3
+ size 990480513
baseline_bs4_top2/checkpoint-10000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89428fc2bed9ce2a97559ad926183a8a8fb059a55491935c6cdb5773685812f4
3
+ size 14511
baseline_bs4_top2/checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89744af0d534dd9add5a42ebd997c43178aeb78f0f65e79af8379d8a5c11b73a
3
+ size 627
baseline_bs4_top2/checkpoint-10000/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": "</s>",
105
+ "pad_token": "<pad>",
106
+ "unk_token": "<unk>"
107
+ }
baseline_bs4_top2/checkpoint-10000/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
baseline_bs4_top2/checkpoint-10000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
baseline_bs4_top2/checkpoint-10000/tokenizer_config.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "clean_up_tokenization_spaces": true,
105
+ "eos_token": "</s>",
106
+ "extra_ids": 100,
107
+ "model_max_length": 512,
108
+ "pad_token": "<pad>",
109
+ "sp_model_kwargs": {},
110
+ "tokenizer_class": "T5Tokenizer",
111
+ "unk_token": "<unk>"
112
+ }
baseline_bs4_top2/checkpoint-10000/trainer_state.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.008527584604298755,
5
+ "eval_steps": 500,
6
+ "global_step": 10000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 0.01,
14
+ "loss": 1.6449,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "learning_rate": 0.01,
20
+ "loss": 1.5651,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.0,
25
+ "learning_rate": 0.01,
26
+ "loss": 1.5875,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.0,
31
+ "learning_rate": 0.01,
32
+ "loss": 1.5539,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.0,
37
+ "learning_rate": 0.01,
38
+ "loss": 1.5524,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.0,
43
+ "learning_rate": 0.01,
44
+ "loss": 1.5449,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.0,
49
+ "learning_rate": 0.01,
50
+ "loss": 1.549,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.0,
55
+ "learning_rate": 0.01,
56
+ "loss": 1.5671,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.0,
61
+ "learning_rate": 0.01,
62
+ "loss": 1.5251,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.0,
67
+ "learning_rate": 0.01,
68
+ "loss": 1.5532,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.0,
73
+ "learning_rate": 0.01,
74
+ "loss": 1.5262,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.01,
79
+ "learning_rate": 0.01,
80
+ "loss": 1.5306,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.01,
85
+ "learning_rate": 0.01,
86
+ "loss": 1.5273,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.01,
91
+ "learning_rate": 0.01,
92
+ "loss": 1.519,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 0.01,
97
+ "learning_rate": 0.01,
98
+ "loss": 1.5305,
99
+ "step": 7500
100
+ },
101
+ {
102
+ "epoch": 0.01,
103
+ "learning_rate": 0.01,
104
+ "loss": 1.507,
105
+ "step": 8000
106
+ },
107
+ {
108
+ "epoch": 0.01,
109
+ "learning_rate": 0.01,
110
+ "loss": 1.5255,
111
+ "step": 8500
112
+ },
113
+ {
114
+ "epoch": 0.01,
115
+ "learning_rate": 0.01,
116
+ "loss": 1.5106,
117
+ "step": 9000
118
+ },
119
+ {
120
+ "epoch": 0.01,
121
+ "learning_rate": 0.01,
122
+ "loss": 1.5224,
123
+ "step": 9500
124
+ },
125
+ {
126
+ "epoch": 0.01,
127
+ "learning_rate": 0.01,
128
+ "loss": 1.5241,
129
+ "step": 10000
130
+ }
131
+ ],
132
+ "logging_steps": 500,
133
+ "max_steps": 20000,
134
+ "num_train_epochs": 1,
135
+ "save_steps": 10000,
136
+ "total_flos": 4.829257277256499e+16,
137
+ "trial_name": null,
138
+ "trial_params": null
139
+ }
baseline_bs4_top2/checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af2adb9b227cf0f18c45853aa1db09565dc63c29547902da6f50352111a9a5e7
3
+ size 4539
baseline_bs4_top2/checkpoint-20000/config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/flan-t5-base",
3
+ "architectures": [
4
+ "SoftRelPromptFlanT5"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2048,
8
+ "d_kv": 64,
9
+ "d_model": 768,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 12,
22
+ "num_heads": 12,
23
+ "num_layers": 12,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_max_distance": 128,
27
+ "relative_attention_num_buckets": 32,
28
+ "task_specific_params": {
29
+ "summarization": {
30
+ "early_stopping": true,
31
+ "length_penalty": 2.0,
32
+ "max_length": 200,
33
+ "min_length": 30,
34
+ "no_repeat_ngram_size": 3,
35
+ "num_beams": 4,
36
+ "prefix": "summarize: "
37
+ },
38
+ "translation_en_to_de": {
39
+ "early_stopping": true,
40
+ "max_length": 300,
41
+ "num_beams": 4,
42
+ "prefix": "translate English to German: "
43
+ },
44
+ "translation_en_to_fr": {
45
+ "early_stopping": true,
46
+ "max_length": 300,
47
+ "num_beams": 4,
48
+ "prefix": "translate English to French: "
49
+ },
50
+ "translation_en_to_ro": {
51
+ "early_stopping": true,
52
+ "max_length": 300,
53
+ "num_beams": 4,
54
+ "prefix": "translate English to Romanian: "
55
+ }
56
+ },
57
+ "tie_word_embeddings": false,
58
+ "torch_dtype": "float32",
59
+ "transformers_version": "4.33.1",
60
+ "use_cache": true,
61
+ "vocab_size": 32128
62
+ }
baseline_bs4_top2/checkpoint-20000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.33.1"
7
+ }
baseline_bs4_top2/checkpoint-20000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4e916e263fcf89b5d721173eee18cb5b06899e27ec18f26d0a22ed9e2016282
3
+ size 144545
baseline_bs4_top2/checkpoint-20000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3835ee505ebe813e921bacd0eea2ed25ed36c4ff4e46de9431631321049a3a53
3
+ size 990480513
baseline_bs4_top2/checkpoint-20000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:973f9fb61da573e339acd801c6477b8cc4497eed865244e05b84a0eedbe74768
3
+ size 14511
baseline_bs4_top2/checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3ddfbcd33fc0d81c222807ca3e42cd9654f7e531f573941ed9599b1e07e0373
3
+ size 627