diff --git a/.gitattributes b/.gitattributes
index 637fa167e56685c01bc97f08a420ea76330cf6df..1fd4826e6f1aafb9303f7a6f9709083bd5723fc3 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -30,3 +30,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a9ee4255fb8b0dced6c7760bb7778d90502f337
--- /dev/null
+++ b/config.json
@@ -0,0 +1,25 @@
+{
+ "apply_residual_connection_post_layernorm": false,
+ "attention_dropout": 0.0,
+ "architectures": [
+ "BloomModel"
+ ],
+ "attention_softmax_in_fp32": true,
+ "pad_token_id": 3,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_dropout": 0.0,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "masked_softmax_fusion": true,
+ "model_type": "bloom",
+ "n_embed": 14336,
+ "n_layer": 70,
+ "num_attention_heads": 112,
+ "pretraining_tp": 4,
+ "slow_but_exact": false,
+ "transformers_version": "4.21.0",
+ "use_cache": true,
+ "vocab_size": 250880
+}
+
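The new config.json pins the BLOOM architecture hyperparameters for this checkpoint (70 layers, 112 attention heads, a 14336-dimensional hidden size via the legacy `n_embed` key, and a 250880-token vocabulary). As a minimal sketch of how such a file is typically consumed — assuming the Hugging Face `transformers` library and a local checkout of this repository; the relative path `config.json` is illustrative, not part of the diff:

```python
# Minimal sketch: load the config.json added above with Hugging Face transformers.
# BloomConfig maps the legacy "n_embed" key to hidden_size and exposes
# "num_attention_heads" as an alias for n_head.
from transformers import BloomConfig

config = BloomConfig.from_json_file("config.json")

print(config.model_type)            # "bloom"
print(config.n_layer)               # 70
print(config.num_attention_heads)   # 112
print(config.hidden_size)           # 14336 (taken from the "n_embed" key)
print(config.vocab_size)            # 250880
```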
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..fa14ca12f337280b15a3bbfa258ffa590c0dbfe3
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.7835870284579749
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
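Every results.json added under evaluation_l1/ follows the same small schema: the dataset name, the dataset config (language code), the prompt template name, an `evaluation` block holding the accuracy, and the serialized argparse `Namespace` the run was launched with. A minimal sketch, assuming a local checkout containing the evaluation_l1/ tree shown in this diff, that tabulates accuracy per dataset, language, and template:

```python
# Minimal sketch: walk evaluation_l1/<dataset>/<lang>/<template>/results.json
# and print one accuracy row per prompt template.
import json
from pathlib import Path

rows = []
for path in sorted(Path("evaluation_l1").glob("*/*/*/results.json")):
    with path.open() as f:
        result = json.load(f)
    rows.append((
        result["dataset_name"],
        result["dataset_config_name"],
        result["template_name"],
        result["evaluation"]["accuracy"],
    ))

for dataset, lang, template, accuracy in rows:
    print(f"{dataset:28s} {lang:3s} {template:32s} {accuracy:.4f}")
```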
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..622b0178d3dc61824ffc4b9d954e29bbd0185009
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.9291859695565851
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3e3d5afe9f834ed904e4b4e7eda2b9f6baf767e
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.6624751819986764
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4210ba765770eac93980a029b2bc352c8972c508
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.9252150893448048
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f560c7f8871d2369ce47abd12e53e7667e83db89
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.9159497021839841
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a8c7a2f686d8f7e86828e16fbfd066edfdf6cd0
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.870946393117141
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..cdfab5115b88fdcc96feeb85dd4f93ee938b65d7
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.9523494374586366
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..49cbe5f2b61f12b9305cf9fb741f260ad3487a6f
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.7319655857048313
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..84abc297253d8c49058803f80b2439bbec58dc0c
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.9477167438782264
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..133194939f44992d70f7b27f95d3d923898d3a47
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.9516876240900066
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9f90e311862c4d4f544147953fedfe8b2e09bf27
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.6982131039046989
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac3a09567cd16940b268699baf757d683c11d3e2
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.85704831237591
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..537bd7aca6cf5b4d60179d3e77429be8d2472ffa
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.614162806088683
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b613d66b0fa70d7ede65259912c94c972606b72
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.8590337524818001
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b78868605b4fc2d566a9cf12cfbd26fecb04615f
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.8504301786896096
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..dfcddff9122832e4023de739a08b165405f4bf97
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.7683653209794837
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0249f476c61770d371aa612ab6f723bec18d3c63
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.8742554599602912
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..15715bbec8f0ca5626c8896695ab1ec869daef82
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.657180675049636
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2825f4a6c0a15ea38107bcf743d17899fc10205c
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.886829913964262
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c3a3be4b72ddd989d75644db452c9a43117588c
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.8762409000661814
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b5f961fd1a7f2e5d4fcc0cf3b9b24c3d5a3636fb
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.8332230311052283
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..08e1da3e2e4a3c184f8a9616f90c0c499646325e
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.913964262078094
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c19d4576266454a6162862c53b416bebe0d6b03
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.700198544010589
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..61259b0ab613799ec9d0e717e6b66e3e5676e6de
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.9205823957643945
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0599dff562b06459a0b386388649b3fdb2d795d3
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.9086697551290536
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d4fea73deeac81c8a3056ea7163c644193143f38
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.870946393117141
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..50a8a9f7bcc5d647795ef2452673fb071e542ef9
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.9265387160820648
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..927c5f705c7b4716a9e99d27d3dde0bd724ba42c
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.6823295830575777
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json b/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b1a9390ed70408e37d3597045e7b9ec029eee5b4
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.928524156187955
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json b/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c912504a65bd932e2813c6dfbec4dfa31c750e9f
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.9232296492389146
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json b/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8a5a22b9790053662904d0e281b3325e8779d96
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.6933333333333334
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json b/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..fb72eae6a032d146ba4bda652b3c2831cb267690
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.5212903225806451
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json b/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..66aa24f4760f073f17cd55f9e6a510c174ad2875
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.6563440860215054
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json b/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..96dd97d1f840cfe0c4253ab2c2c375afba165614
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.5156989247311828
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json b/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ea1d115d892e712d5a15507f7c53b35973f58e6
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "en",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.6473118279569893
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json b/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0e44039abfed86c270ecfd42407b4a632f5beb6a
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.6024096385542169
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json b/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..676ff78442f70e7604c9d4e4a162104fba700bc5
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.46987951807228917
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json b/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..57fa5bc71b075b999289da2b81ab4cbd13217d92
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.5903614457831325
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json b/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..aca34b8ede42cbef1f40485a79e370482f93eb7f
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.4939759036144578
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json b/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e52bc4b92c7834bfbc21e02c80d8ec944c6eb26
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.6867469879518072
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json b/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a6b4ece370951cdb9b44db9ac165944227af945
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.6463878326996197
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json b/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..14851a8471b73c07a28c1b00f9b88a4aa6722984
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.5285171102661597
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json b/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..091ffea556f32c5e4e5869762a4c921446a22f69
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.6007604562737643
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json b/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..310ef9e24ba6109c7c26de82e901349762c9471a
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.49809885931558934
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json b/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef31c9a05fdae650875c5ab14ba65ad812396606
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.6083650190114068
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json b/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fbc6d83b73966e85bc5e1da6f2cd012adba195a
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.7063492063492064
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json b/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..51ed558709378889fed0678b936ed71050d4f2db
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.5515873015873016
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json b/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f0f243e26825b9625fdde9c5dd2af458fce410ac
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.621031746031746
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json b/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ee2b197de90b9471d5898fed01f8d44b6ffbe7f
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.5158730158730159
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json b/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bb725b17616be084da49f9d036df24a26aa8fc3e
--- /dev/null
+++ b/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.6765873015873016
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r1/GPT-3_style/results.json b/evaluation_l1/anli/dev_r1/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..624ef5ca1c606f97954ef9489cd9b411b4aacdd9
--- /dev/null
+++ b/evaluation_l1/anli/dev_r1/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.497
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json b/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..17bea05f2e6e2b5dd592b8a4f29bb60008648757
--- /dev/null
+++ b/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.442
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r1/can_we_infer/results.json b/evaluation_l1/anli/dev_r1/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..724d47c0dc6f6ef4d49e9caf8b4b2b63b9b58ac7
--- /dev/null
+++ b/evaluation_l1/anli/dev_r1/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.456
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json b/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5309d4d5e0a389c68d33eac1f803108aec5560b6
--- /dev/null
+++ b/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.328
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r1/justified_in_saying/results.json b/evaluation_l1/anli/dev_r1/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c116ebcd4d24a3000e5a7370b54433dece9b944e
--- /dev/null
+++ b/evaluation_l1/anli/dev_r1/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r1",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.46
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r2/GPT-3_style/results.json b/evaluation_l1/anli/dev_r2/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..eccf8c1551bd97f197744e7516bcf00470f66e65
--- /dev/null
+++ b/evaluation_l1/anli/dev_r2/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.45
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json b/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..42e7d317c933d0b4944e0282146dc394124acb68
--- /dev/null
+++ b/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.382
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r2/can_we_infer/results.json b/evaluation_l1/anli/dev_r2/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bcf2167c91a3a87a89a39b4ba636a92c1c0499ee
--- /dev/null
+++ b/evaluation_l1/anli/dev_r2/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.419
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json b/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b5e80243b6a064cff22a8ae1358d3a1065f5cc6
--- /dev/null
+++ b/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.345
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r2/justified_in_saying/results.json b/evaluation_l1/anli/dev_r2/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..104e8bc71bf21d47b3390778a2bae08084763b39
--- /dev/null
+++ b/evaluation_l1/anli/dev_r2/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r2",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.41
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r3/GPT-3_style/results.json b/evaluation_l1/anli/dev_r3/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ceae21ce359e3db0adf8516d4199124caa0e5a81
--- /dev/null
+++ b/evaluation_l1/anli/dev_r3/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.4558333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json b/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5e133c1acdb994634d864698c93a5014e3f47019
--- /dev/null
+++ b/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.41333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r3/can_we_infer/results.json b/evaluation_l1/anli/dev_r3/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc8705fa2dffaf0d5ab82b34ab8e28b6c4c9ae1e
--- /dev/null
+++ b/evaluation_l1/anli/dev_r3/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.4225
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json b/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc1c9c2cbed6db1dba1a6b120cd962090e1c94b9
--- /dev/null
+++ b/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.305
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/anli/dev_r3/justified_in_saying/results.json b/evaluation_l1/anli/dev_r3/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..81a1ad51681d7dba238377f77944974385162d26
--- /dev/null
+++ b/evaluation_l1/anli/dev_r3/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "anli",
+ "dataset_config_name": "dev_r3",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.4083333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/merged.csv b/evaluation_l1/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..e22f6c3d3ab721a945ee943a9fe18ad51e742d92
--- /dev/null
+++ b/evaluation_l1/merged.csv
@@ -0,0 +1,194 @@
+dataset,prompt,metric,value
+anli_dev_r1,GPT-3 style,accuracy,0.497
+anli_dev_r1,MNLI crowdsource,accuracy,0.442
+anli_dev_r1,can we infer,accuracy,0.456
+anli_dev_r1,guaranteed/possible/impossible,accuracy,0.328
+anli_dev_r1,justified in saying,accuracy,0.46
+anli_dev_r1,median,accuracy,0.456
+anli_dev_r2,GPT-3 style,accuracy,0.45
+anli_dev_r2,MNLI crowdsource,accuracy,0.382
+anli_dev_r2,can we infer,accuracy,0.419
+anli_dev_r2,guaranteed/possible/impossible,accuracy,0.345
+anli_dev_r2,justified in saying,accuracy,0.41
+anli_dev_r2,median,accuracy,0.41
+anli_dev_r3,GPT-3 style,accuracy,0.4558333333333333
+anli_dev_r3,MNLI crowdsource,accuracy,0.41333333333333333
+anli_dev_r3,can we infer,accuracy,0.4225
+anli_dev_r3,guaranteed/possible/impossible,accuracy,0.305
+anli_dev_r3,justified in saying,accuracy,0.4083333333333333
+anli_dev_r3,median,accuracy,0.41333333333333333
+story_cloze_2016,Answer Given options,accuracy,0.9524318546231961
+story_cloze_2016,Choose Story Ending,accuracy,0.9668626402993051
+story_cloze_2016,Generate Ending,accuracy,0.7760555852485302
+story_cloze_2016,Novel Correct Ending,accuracy,0.9583110636023516
+story_cloze_2016,Story Continuation and Options,accuracy,0.9593800106894709
+story_cloze_2016,median,accuracy,0.9583110636023516
+super_glue_cb,GPT-3 style,accuracy,0.875
+super_glue_cb,MNLI crowdsource,accuracy,0.35714285714285715
+super_glue_cb,can we infer,accuracy,0.75
+super_glue_cb,guaranteed/possible/impossible,accuracy,0.7678571428571429
+super_glue_cb,justified in saying,accuracy,0.8035714285714286
+super_glue_cb,median,accuracy,0.7678571428571429
+super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.75
+super_glue_copa,best_option,accuracy,0.87
+super_glue_copa,cause_effect,accuracy,0.9
+super_glue_copa,i_am_hesitating,accuracy,0.91
+super_glue_copa,plausible_alternatives,accuracy,0.91
+super_glue_copa,median,accuracy,0.9
+super_glue_rte,GPT-3 style,accuracy,0.7870036101083032
+super_glue_rte,MNLI crowdsource,accuracy,0.8592057761732852
+super_glue_rte,does it follow that,accuracy,0.8194945848375451
+super_glue_rte,guaranteed true,accuracy,0.7942238267148014
+super_glue_rte,should assume,accuracy,0.8122743682310469
+super_glue_rte,median,accuracy,0.8122743682310469
+winogrande_winogrande_xl,Replace,accuracy,0.5998421468034728
+winogrande_winogrande_xl,True or False,accuracy,0.5359116022099447
+winogrande_winogrande_xl,does underscore refer to,accuracy,0.5864246250986582
+winogrande_winogrande_xl,stand for,accuracy,0.5201262825572218
+winogrande_winogrande_xl,underscore refer to,accuracy,0.5880031570639306
+winogrande_winogrande_xl,median,accuracy,0.5864246250986582
+xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.56
+xcopa_id,best_option,accuracy,0.81
+xcopa_id,cause_effect,accuracy,0.87
+xcopa_id,i_am_hesitating,accuracy,0.83
+xcopa_id,plausible_alternatives,accuracy,0.87
+xcopa_id,median,accuracy,0.83
+xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.6
+xcopa_sw,best_option,accuracy,0.62
+xcopa_sw,cause_effect,accuracy,0.64
+xcopa_sw,i_am_hesitating,accuracy,0.66
+xcopa_sw,plausible_alternatives,accuracy,0.64
+xcopa_sw,median,accuracy,0.64
+xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.59
+xcopa_ta,best_option,accuracy,0.66
+xcopa_ta,cause_effect,accuracy,0.7
+xcopa_ta,i_am_hesitating,accuracy,0.69
+xcopa_ta,plausible_alternatives,accuracy,0.64
+xcopa_ta,median,accuracy,0.66
+xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.58
+xcopa_vi,best_option,accuracy,0.81
+xcopa_vi,cause_effect,accuracy,0.91
+xcopa_vi,i_am_hesitating,accuracy,0.85
+xcopa_vi,plausible_alternatives,accuracy,0.84
+xcopa_vi,median,accuracy,0.84
+xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.57
+xcopa_zh,best_option,accuracy,0.84
+xcopa_zh,cause_effect,accuracy,0.86
+xcopa_zh,i_am_hesitating,accuracy,0.86
+xcopa_zh,plausible_alternatives,accuracy,0.81
+xcopa_zh,median,accuracy,0.84
+xnli_ar,GPT-3 style,accuracy,0.5578313253012048
+xnli_ar,MNLI crowdsource,accuracy,0.41164658634538154
+xnli_ar,can we infer,accuracy,0.5152610441767068
+xnli_ar,guaranteed/possible/impossible,accuracy,0.5803212851405622
+xnli_ar,justified in saying,accuracy,0.5184738955823294
+xnli_ar,median,accuracy,0.5184738955823294
+xnli_en,GPT-3 style,accuracy,0.6176706827309237
+xnli_en,MNLI crowdsource,accuracy,0.4606425702811245
+xnli_en,can we infer,accuracy,0.5714859437751004
+xnli_en,guaranteed/possible/impossible,accuracy,0.6180722891566265
+xnli_en,justified in saying,accuracy,0.5746987951807229
+xnli_en,median,accuracy,0.5746987951807229
+xnli_es,GPT-3 style,accuracy,0.5911646586345382
+xnli_es,MNLI crowdsource,accuracy,0.43052208835341366
+xnli_es,can we infer,accuracy,0.4397590361445783
+xnli_es,guaranteed/possible/impossible,accuracy,0.5208835341365462
+xnli_es,justified in saying,accuracy,0.41726907630522087
+xnli_es,median,accuracy,0.4397590361445783
+xnli_fr,GPT-3 style,accuracy,0.5911646586345382
+xnli_fr,MNLI crowdsource,accuracy,0.4321285140562249
+xnli_fr,can we infer,accuracy,0.5369477911646586
+xnli_fr,guaranteed/possible/impossible,accuracy,0.5176706827309236
+xnli_fr,justified in saying,accuracy,0.5385542168674698
+xnli_fr,median,accuracy,0.5369477911646586
+xnli_hi,GPT-3 style,accuracy,0.5208835341365462
+xnli_hi,MNLI crowdsource,accuracy,0.3819277108433735
+xnli_hi,can we infer,accuracy,0.44176706827309237
+xnli_hi,guaranteed/possible/impossible,accuracy,0.5253012048192771
+xnli_hi,justified in saying,accuracy,0.44377510040160645
+xnli_hi,median,accuracy,0.44377510040160645
+xnli_sw,GPT-3 style,accuracy,0.5036144578313253
+xnli_sw,MNLI crowdsource,accuracy,0.3887550200803213
+xnli_sw,can we infer,accuracy,0.44216867469879517
+xnli_sw,guaranteed/possible/impossible,accuracy,0.38795180722891565
+xnli_sw,justified in saying,accuracy,0.4397590361445783
+xnli_sw,median,accuracy,0.4397590361445783
+xnli_ur,GPT-3 style,accuracy,0.4907630522088353
+xnli_ur,MNLI crowdsource,accuracy,0.37309236947791163
+xnli_ur,can we infer,accuracy,0.45863453815261046
+xnli_ur,guaranteed/possible/impossible,accuracy,0.5124497991967871
+xnli_ur,justified in saying,accuracy,0.45582329317269077
+xnli_ur,median,accuracy,0.45863453815261046
+xnli_vi,GPT-3 style,accuracy,0.5582329317269076
+xnli_vi,MNLI crowdsource,accuracy,0.42690763052208835
+xnli_vi,can we infer,accuracy,0.4759036144578313
+xnli_vi,guaranteed/possible/impossible,accuracy,0.5008032128514056
+xnli_vi,justified in saying,accuracy,0.4827309236947791
+xnli_vi,median,accuracy,0.4827309236947791
+xnli_zh,GPT-3 style,accuracy,0.5550200803212851
+xnli_zh,MNLI crowdsource,accuracy,0.4248995983935743
+xnli_zh,can we infer,accuracy,0.43052208835341366
+xnli_zh,guaranteed/possible/impossible,accuracy,0.5526104417670683
+xnli_zh,justified in saying,accuracy,0.44016064257028115
+xnli_zh,median,accuracy,0.44016064257028115
+xstory_cloze_ar,Answer Given options,accuracy,0.7835870284579749
+xstory_cloze_ar,Choose Story Ending,accuracy,0.9291859695565851
+xstory_cloze_ar,Generate Ending,accuracy,0.6624751819986764
+xstory_cloze_ar,Novel Correct Ending,accuracy,0.9252150893448048
+xstory_cloze_ar,Story Continuation and Options,accuracy,0.9159497021839841
+xstory_cloze_ar,median,accuracy,0.9159497021839841
+xstory_cloze_es,Answer Given options,accuracy,0.870946393117141
+xstory_cloze_es,Choose Story Ending,accuracy,0.9523494374586366
+xstory_cloze_es,Generate Ending,accuracy,0.7319655857048313
+xstory_cloze_es,Novel Correct Ending,accuracy,0.9477167438782264
+xstory_cloze_es,Story Continuation and Options,accuracy,0.9516876240900066
+xstory_cloze_es,median,accuracy,0.9477167438782264
+xstory_cloze_eu,Answer Given options,accuracy,0.6982131039046989
+xstory_cloze_eu,Choose Story Ending,accuracy,0.85704831237591
+xstory_cloze_eu,Generate Ending,accuracy,0.614162806088683
+xstory_cloze_eu,Novel Correct Ending,accuracy,0.8590337524818001
+xstory_cloze_eu,Story Continuation and Options,accuracy,0.8504301786896096
+xstory_cloze_eu,median,accuracy,0.8504301786896096
+xstory_cloze_hi,Answer Given options,accuracy,0.7683653209794837
+xstory_cloze_hi,Choose Story Ending,accuracy,0.8742554599602912
+xstory_cloze_hi,Generate Ending,accuracy,0.657180675049636
+xstory_cloze_hi,Novel Correct Ending,accuracy,0.886829913964262
+xstory_cloze_hi,Story Continuation and Options,accuracy,0.8762409000661814
+xstory_cloze_hi,median,accuracy,0.8742554599602912
+xstory_cloze_id,Answer Given options,accuracy,0.8332230311052283
+xstory_cloze_id,Choose Story Ending,accuracy,0.913964262078094
+xstory_cloze_id,Generate Ending,accuracy,0.700198544010589
+xstory_cloze_id,Novel Correct Ending,accuracy,0.9205823957643945
+xstory_cloze_id,Story Continuation and Options,accuracy,0.9086697551290536
+xstory_cloze_id,median,accuracy,0.9086697551290536
+xstory_cloze_zh,Answer Given options,accuracy,0.870946393117141
+xstory_cloze_zh,Choose Story Ending,accuracy,0.9265387160820648
+xstory_cloze_zh,Generate Ending,accuracy,0.6823295830575777
+xstory_cloze_zh,Novel Correct Ending,accuracy,0.928524156187955
+xstory_cloze_zh,Story Continuation and Options,accuracy,0.9232296492389146
+xstory_cloze_zh,median,accuracy,0.9232296492389146
+xwinograd_en,Replace,accuracy,0.6933333333333334
+xwinograd_en,True or False,accuracy,0.5212903225806451
+xwinograd_en,does underscore refer to,accuracy,0.6563440860215054
+xwinograd_en,stand for,accuracy,0.5156989247311828
+xwinograd_en,underscore refer to,accuracy,0.6473118279569893
+xwinograd_en,median,accuracy,0.6473118279569893
+xwinograd_fr,Replace,accuracy,0.6024096385542169
+xwinograd_fr,True or False,accuracy,0.46987951807228917
+xwinograd_fr,does underscore refer to,accuracy,0.5903614457831325
+xwinograd_fr,stand for,accuracy,0.4939759036144578
+xwinograd_fr,underscore refer to,accuracy,0.6867469879518072
+xwinograd_fr,median,accuracy,0.5903614457831325
+xwinograd_pt,Replace,accuracy,0.6463878326996197
+xwinograd_pt,True or False,accuracy,0.5285171102661597
+xwinograd_pt,does underscore refer to,accuracy,0.6007604562737643
+xwinograd_pt,stand for,accuracy,0.49809885931558934
+xwinograd_pt,underscore refer to,accuracy,0.6083650190114068
+xwinograd_pt,median,accuracy,0.6007604562737643
+xwinograd_zh,Replace,accuracy,0.7063492063492064
+xwinograd_zh,True or False,accuracy,0.5515873015873016
+xwinograd_zh,does underscore refer to,accuracy,0.621031746031746
+xwinograd_zh,stand for,accuracy,0.5158730158730159
+xwinograd_zh,underscore refer to,accuracy,0.6765873015873016
+xwinograd_zh,median,accuracy,0.621031746031746
+multiple,average,multiple,0.6665267892901372
diff --git a/evaluation_l1/merged.json b/evaluation_l1/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..501d6ffa6ac3a5c6e3c21ad460331ab55203d9b0
--- /dev/null
+++ b/evaluation_l1/merged.json
@@ -0,0 +1 @@
+{"Muennighoff/xstory_cloze_ar": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7835870284579749}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9291859695565851}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6624751819986764}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9252150893448048}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, 
dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9159497021839841}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.870946393117141}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9523494374586366}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7319655857048313}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9477167438782264}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9516876240900066}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6982131039046989}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.85704831237591}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.614162806088683}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8590337524818001}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8504301786896096}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7683653209794837}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8742554599602912}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.657180675049636}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.886829913964262}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8762409000661814}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8332230311052283}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.913964262078094}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.700198544010589}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9205823957643945}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9086697551290536}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.870946393117141}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9265387160820648}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6823295830575777}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct 
Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.928524156187955}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9232296492389146}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xwinograd_en": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6933333333333334}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5212903225806451}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6563440860215054}, "template_name": "does 
underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5156989247311828}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6473118279569893}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_fr": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6024096385542169}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.46987951807228917}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5903614457831325}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4939759036144578}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6867469879518072}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_pt": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6463878326996197}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5285171102661597}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6007604562737643}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.49809885931558934}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6083650190114068}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_zh": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", 
"dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.7063492063492064}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5515873015873016}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.621031746031746}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5158730158730159}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6765873015873016}, "template_name": "underscore refer to"}}, "anli_dev_r1": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, 
dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.497}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.442}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.456}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.328}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', 
target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.46}, "template_name": "justified in saying"}}, "anli_dev_r2": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.45}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.382}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.419}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.345}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', 
debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.41}, "template_name": "justified in saying"}}, "anli_dev_r3": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.4558333333333333}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.41333333333333333}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.4225}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, 
prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.305}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.4083333333333333}, "template_name": "justified in saying"}}, "story_cloze_2016": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.9524318546231961}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.9668626402993051}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 
0.7760555852485302}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.9583110636023516}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.9593800106894709}, "template_name": "Story Continuation and Options"}}, "super_glue_cb": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.875}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.35714285714285715}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.75}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7678571428571429}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8035714285714286}, "template_name": "justified in saying"}}, "super_glue_copa": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.75}, "template_name": "C1 or C2? 
premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.87}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.9}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.91}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.91}, "template_name": "plausible_alternatives"}}, "super_glue_rte": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7870036101083032}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8592057761732852}, "template_name": "MNLI crowdsource"}, "does it follow that": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does it follow that', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8194945848375451}, "template_name": "does it follow that"}, "guaranteed true": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7942238267148014}, "template_name": "guaranteed true"}, "should assume": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8122743682310469}, "template_name": "should assume"}}, "winogrande_winogrande_xl": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5998421468034728}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5359116022099447}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5864246250986582}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5201262825572218}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, 
dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5880031570639306}, "template_name": "underscore refer to"}}, "xcopa_id": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.56}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.81}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.87}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.83}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.87}, "template_name": "plausible_alternatives"}}, "xcopa_sw": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? 
premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.66}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "plausible_alternatives"}}, "xcopa_ta": {"C1 or C2? 
premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.59}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.66}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.7}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.69}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "plausible_alternatives"}}, "xcopa_vi": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.58}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.81}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.91}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.85}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "plausible_alternatives"}}, "xcopa_zh": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.57}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.86}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.86}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.81}, "template_name": "plausible_alternatives"}}, "xnli_ar": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5578313253012048}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41164658634538154}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, 
template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5152610441767068}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5803212851405622}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5184738955823294}, "template_name": "justified in saying"}}, "xnli_en": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.6176706827309237}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4606425702811245}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='en', 
dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5714859437751004}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.6180722891566265}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5746987951807229}, "template_name": "justified in saying"}}, "xnli_es": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5911646586345382}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43052208835341366}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4397590361445783}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5208835341365462}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41726907630522087}, "template_name": "justified in saying"}}, "xnli_fr": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5911646586345382}, "template_name": 
"GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4321285140562249}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5369477911646586}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5176706827309236}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5385542168674698}, "template_name": "justified in saying"}}, "xnli_hi": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5208835341365462}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3819277108433735}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44176706827309237}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5253012048192771}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, 
use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44377510040160645}, "template_name": "justified in saying"}}, "xnli_sw": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5036144578313253}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3887550200803213}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44216867469879517}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38795180722891565}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4397590361445783}, "template_name": "justified in saying"}}, "xnli_ur": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4907630522088353}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37309236947791163}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45863453815261046}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', 
target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5124497991967871}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45582329317269077}, "template_name": "justified in saying"}}, "xnli_vi": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5582329317269076}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42690763052208835}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4759036144578313}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, 
dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5008032128514056}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4827309236947791}, "template_name": "justified in saying"}}, "xnli_zh": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5550200803212851}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4248995983935743}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43052208835341366}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5526104417670683}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44016064257028115}, "template_name": "justified in saying"}}}
\ No newline at end of file
diff --git a/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json b/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c17d76a62d827c060ee745ccf9611eac8ff3cdac
--- /dev/null
+++ b/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "story_cloze",
+ "dataset_config_name": "2016",
+ "template_name": "Answer Given options",
+ "evaluation": {
+ "accuracy": 0.9524318546231961
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json b/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b402b37266757783bf6291bcee1c9cfa46d216f1
--- /dev/null
+++ b/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "story_cloze",
+ "dataset_config_name": "2016",
+ "template_name": "Choose Story Ending",
+ "evaluation": {
+ "accuracy": 0.9668626402993051
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/story_cloze/2016/Generate_Ending/results.json b/evaluation_l1/story_cloze/2016/Generate_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..fb9415efb88e94a410bb8a2473f70078b3feffb3
--- /dev/null
+++ b/evaluation_l1/story_cloze/2016/Generate_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "story_cloze",
+ "dataset_config_name": "2016",
+ "template_name": "Generate Ending",
+ "evaluation": {
+ "accuracy": 0.7760555852485302
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json b/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c6a1159adea4b46e6d68e29b5ccf22f7d3eeded
--- /dev/null
+++ b/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "story_cloze",
+ "dataset_config_name": "2016",
+ "template_name": "Novel Correct Ending",
+ "evaluation": {
+ "accuracy": 0.9583110636023516
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json b/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a5a5c76a3c4b4526a73cf505aa142660bf95af7
--- /dev/null
+++ b/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "story_cloze",
+ "dataset_config_name": "2016",
+ "template_name": "Story Continuation and Options",
+ "evaluation": {
+ "accuracy": 0.9593800106894709
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/cb/GPT-3_style/results.json b/evaluation_l1/super_glue/cb/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..150551119a34190b84b09a536f5bd1058b09bf1c
--- /dev/null
+++ b/evaluation_l1/super_glue/cb/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.875
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json b/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f504eb8fa769b4398bc5bfed5fd0032fbb5b979e
--- /dev/null
+++ b/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.35714285714285715
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/cb/can_we_infer/results.json b/evaluation_l1/super_glue/cb/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5e4c7c789ae4d0104e023f693f89f383f2d7765c
--- /dev/null
+++ b/evaluation_l1/super_glue/cb/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.75
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json b/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..52ea214f6c6b2ebd759e937adf6da63abfc8cc43
--- /dev/null
+++ b/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.7678571428571429
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/cb/justified_in_saying/results.json b/evaluation_l1/super_glue/cb/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8d1911dc1d0b22bc55f044765a8435c1f0ca95c2
--- /dev/null
+++ b/evaluation_l1/super_glue/cb/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "cb",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.8035714285714286
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json b/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd07711fac65df94ecc461abbb54e79c4c29e1b1
--- /dev/null
+++ b/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "C1 or C2? premise, so/because\u2026",
+ "evaluation": {
+ "accuracy": 0.75
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/copa/best_option/results.json b/evaluation_l1/super_glue/copa/best_option/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bbfcd2b3a8c506efc556c73a0a3177eef86355c
--- /dev/null
+++ b/evaluation_l1/super_glue/copa/best_option/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "best_option",
+ "evaluation": {
+ "accuracy": 0.87
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/copa/cause_effect/results.json b/evaluation_l1/super_glue/copa/cause_effect/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..92a200f978839cc7833bf68d793994f49f880572
--- /dev/null
+++ b/evaluation_l1/super_glue/copa/cause_effect/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "cause_effect",
+ "evaluation": {
+ "accuracy": 0.9
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/copa/i_am_hesitating/results.json b/evaluation_l1/super_glue/copa/i_am_hesitating/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..103509431a1ccfac1b771cd1d235bce0d3b70c4b
--- /dev/null
+++ b/evaluation_l1/super_glue/copa/i_am_hesitating/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "i_am_hesitating",
+ "evaluation": {
+ "accuracy": 0.91
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/copa/plausible_alternatives/results.json b/evaluation_l1/super_glue/copa/plausible_alternatives/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0db9f991a03dbdd962d70f3b63f0ccc795887f73
--- /dev/null
+++ b/evaluation_l1/super_glue/copa/plausible_alternatives/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "copa",
+ "template_name": "plausible_alternatives",
+ "evaluation": {
+ "accuracy": 0.91
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/rte/GPT-3_style/results.json b/evaluation_l1/super_glue/rte/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..074738d3a743bb6e89779674116678b7551cfb54
--- /dev/null
+++ b/evaluation_l1/super_glue/rte/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.7870036101083032
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json b/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..306946726d625ff5536f78fcee8d6be028f7a901
--- /dev/null
+++ b/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.8592057761732852
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/rte/does_it_follow_that/results.json b/evaluation_l1/super_glue/rte/does_it_follow_that/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..92225b08e90d636f57f4ac20ac58b082a7774712
--- /dev/null
+++ b/evaluation_l1/super_glue/rte/does_it_follow_that/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "does it follow that",
+ "evaluation": {
+ "accuracy": 0.8194945848375451
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does it follow that', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/rte/guaranteed_true/results.json b/evaluation_l1/super_glue/rte/guaranteed_true/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..13e0a08ce6250b6f6e5a37faa695793aa0bbecb0
--- /dev/null
+++ b/evaluation_l1/super_glue/rte/guaranteed_true/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "guaranteed true",
+ "evaluation": {
+ "accuracy": 0.7942238267148014
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/super_glue/rte/should_assume/results.json b/evaluation_l1/super_glue/rte/should_assume/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..787186a80cc2515fe6118deece5a17d189a47d25
--- /dev/null
+++ b/evaluation_l1/super_glue/rte/should_assume/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "super_glue",
+ "dataset_config_name": "rte",
+ "template_name": "should assume",
+ "evaluation": {
+ "accuracy": 0.8122743682310469
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/winogrande/winogrande_xl/Replace/results.json b/evaluation_l1/winogrande/winogrande_xl/Replace/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..42019f2d1257574dd2ec471f125ef67a3dd6f761
--- /dev/null
+++ b/evaluation_l1/winogrande/winogrande_xl/Replace/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "Replace",
+ "evaluation": {
+ "accuracy": 0.5998421468034728
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json b/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7570b5a289024d5d31ca1e682a269567a07df00f
--- /dev/null
+++ b/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "True or False",
+ "evaluation": {
+ "accuracy": 0.5359116022099447
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json b/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbe231ea5c0a34aea5c9f156b5736f80199ac088
--- /dev/null
+++ b/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "does underscore refer to",
+ "evaluation": {
+ "accuracy": 0.5864246250986582
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json b/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6819315d5fa6359d1ebf74890878512463babfd8
--- /dev/null
+++ b/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "stand for",
+ "evaluation": {
+ "accuracy": 0.5201262825572218
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json b/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5cf04d02a842555ba0d5dd39f2b3298ffba8250
--- /dev/null
+++ b/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "winogrande",
+ "dataset_config_name": "winogrande_xl",
+ "template_name": "underscore refer to",
+ "evaluation": {
+ "accuracy": 0.5880031570639306
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json b/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1088c9040872e3ae150c45ed74645051dbe1144
--- /dev/null
+++ b/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "C1 or C2? premise, so/because\u2026",
+ "evaluation": {
+ "accuracy": 0.56
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/id/best_option/results.json b/evaluation_l1/xcopa/id/best_option/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a20d27e3166b1f2c426acae2733282ee425f394d
--- /dev/null
+++ b/evaluation_l1/xcopa/id/best_option/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "best_option",
+ "evaluation": {
+ "accuracy": 0.81
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/id/cause_effect/results.json b/evaluation_l1/xcopa/id/cause_effect/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c5e9acce7d4e1f6ae99e36d6c9ca92de41d7e30
--- /dev/null
+++ b/evaluation_l1/xcopa/id/cause_effect/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "cause_effect",
+ "evaluation": {
+ "accuracy": 0.87
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/id/i_am_hesitating/results.json b/evaluation_l1/xcopa/id/i_am_hesitating/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bd8d6933b8ef7729962ea2c239587e9cbd2311ec
--- /dev/null
+++ b/evaluation_l1/xcopa/id/i_am_hesitating/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "i_am_hesitating",
+ "evaluation": {
+ "accuracy": 0.83
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/id/plausible_alternatives/results.json b/evaluation_l1/xcopa/id/plausible_alternatives/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7a367c68f6181bbe1d1a9acb9f57b9f7228d2561
--- /dev/null
+++ b/evaluation_l1/xcopa/id/plausible_alternatives/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "plausible_alternatives",
+ "evaluation": {
+ "accuracy": 0.87
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json b/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..923cf175a2ea8cee40930c011a9e6277f7ec24fb
--- /dev/null
+++ b/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "C1 or C2? premise, so/because\u2026",
+ "evaluation": {
+ "accuracy": 0.6
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/sw/best_option/results.json b/evaluation_l1/xcopa/sw/best_option/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a23f938d1d48be3de69b0ba28625fc4bf2b5e7f3
--- /dev/null
+++ b/evaluation_l1/xcopa/sw/best_option/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "best_option",
+ "evaluation": {
+ "accuracy": 0.62
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/sw/cause_effect/results.json b/evaluation_l1/xcopa/sw/cause_effect/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..72aaf4a87fbe31b7d4b265f0700c6b3d7e1d85e2
--- /dev/null
+++ b/evaluation_l1/xcopa/sw/cause_effect/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "cause_effect",
+ "evaluation": {
+ "accuracy": 0.64
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/sw/i_am_hesitating/results.json b/evaluation_l1/xcopa/sw/i_am_hesitating/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e408b253e1bb3a3ea64e06a37a48df3244969442
--- /dev/null
+++ b/evaluation_l1/xcopa/sw/i_am_hesitating/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "i_am_hesitating",
+ "evaluation": {
+ "accuracy": 0.66
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/sw/plausible_alternatives/results.json b/evaluation_l1/xcopa/sw/plausible_alternatives/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1df4cfe999e3301fb63258105a6bd22d6d8a87d0
--- /dev/null
+++ b/evaluation_l1/xcopa/sw/plausible_alternatives/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "plausible_alternatives",
+ "evaluation": {
+ "accuracy": 0.64
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json b/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..30417333f2d8e1c2f584a1d61725aea60b65c6f1
--- /dev/null
+++ b/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "C1 or C2? premise, so/because\u2026",
+ "evaluation": {
+ "accuracy": 0.59
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/ta/best_option/results.json b/evaluation_l1/xcopa/ta/best_option/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e2654eb7c7fdcc3225e2b671e77820a685159580
--- /dev/null
+++ b/evaluation_l1/xcopa/ta/best_option/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "best_option",
+ "evaluation": {
+ "accuracy": 0.66
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/ta/cause_effect/results.json b/evaluation_l1/xcopa/ta/cause_effect/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..06ed968210883b1677420df46920ae8767ae10d4
--- /dev/null
+++ b/evaluation_l1/xcopa/ta/cause_effect/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "cause_effect",
+ "evaluation": {
+ "accuracy": 0.7
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/ta/i_am_hesitating/results.json b/evaluation_l1/xcopa/ta/i_am_hesitating/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7f856249983a23f49cc7c0ff874576f3165782d5
--- /dev/null
+++ b/evaluation_l1/xcopa/ta/i_am_hesitating/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "i_am_hesitating",
+ "evaluation": {
+ "accuracy": 0.69
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/ta/plausible_alternatives/results.json b/evaluation_l1/xcopa/ta/plausible_alternatives/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..57306bd7fddc09ae459ae50228a5d0e5845181e0
--- /dev/null
+++ b/evaluation_l1/xcopa/ta/plausible_alternatives/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "plausible_alternatives",
+ "evaluation": {
+ "accuracy": 0.64
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json b/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae38a9d01d5d6923042592213e5294c326c81bf7
--- /dev/null
+++ b/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "C1 or C2? premise, so/because\u2026",
+ "evaluation": {
+ "accuracy": 0.58
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/vi/best_option/results.json b/evaluation_l1/xcopa/vi/best_option/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b8dc7ac5d1e291253d6c136a15338d56638ad7ea
--- /dev/null
+++ b/evaluation_l1/xcopa/vi/best_option/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "best_option",
+ "evaluation": {
+ "accuracy": 0.81
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/vi/cause_effect/results.json b/evaluation_l1/xcopa/vi/cause_effect/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e7406329c9039078fb7ae9d2ef72155ae008417
--- /dev/null
+++ b/evaluation_l1/xcopa/vi/cause_effect/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "cause_effect",
+ "evaluation": {
+ "accuracy": 0.91
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/vi/i_am_hesitating/results.json b/evaluation_l1/xcopa/vi/i_am_hesitating/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f41710a911538208550e160eed17539d59b24bf1
--- /dev/null
+++ b/evaluation_l1/xcopa/vi/i_am_hesitating/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "i_am_hesitating",
+ "evaluation": {
+ "accuracy": 0.85
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/vi/plausible_alternatives/results.json b/evaluation_l1/xcopa/vi/plausible_alternatives/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf1bea02a25840d8ecb41d39a44242cef297b981
--- /dev/null
+++ b/evaluation_l1/xcopa/vi/plausible_alternatives/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "plausible_alternatives",
+ "evaluation": {
+ "accuracy": 0.84
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json b/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9a4b3b430590a5347eaef9a5d4e361726881045
--- /dev/null
+++ b/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "C1 or C2? premise, so/because\u2026",
+ "evaluation": {
+ "accuracy": 0.57
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/zh/best_option/results.json b/evaluation_l1/xcopa/zh/best_option/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..aed5c9b07cc6b2a3d46478aea301f538d1b3775a
--- /dev/null
+++ b/evaluation_l1/xcopa/zh/best_option/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "best_option",
+ "evaluation": {
+ "accuracy": 0.84
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/zh/cause_effect/results.json b/evaluation_l1/xcopa/zh/cause_effect/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7e9010b77cae2b075cdc0513155deb6d81aa382
--- /dev/null
+++ b/evaluation_l1/xcopa/zh/cause_effect/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "cause_effect",
+ "evaluation": {
+ "accuracy": 0.86
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/zh/i_am_hesitating/results.json b/evaluation_l1/xcopa/zh/i_am_hesitating/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f11039c838151ff3b5a0a2e27634e683ec56a1f
--- /dev/null
+++ b/evaluation_l1/xcopa/zh/i_am_hesitating/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "i_am_hesitating",
+ "evaluation": {
+ "accuracy": 0.86
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xcopa/zh/plausible_alternatives/results.json b/evaluation_l1/xcopa/zh/plausible_alternatives/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..150f66521abd7106df9da22a6386cb009e460e36
--- /dev/null
+++ b/evaluation_l1/xcopa/zh/plausible_alternatives/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "plausible_alternatives",
+ "evaluation": {
+ "accuracy": 0.81
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ar/GPT-3_style/results.json b/evaluation_l1/xnli/ar/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff01b7379c993414d086d9db17854a7205ac6cc6
--- /dev/null
+++ b/evaluation_l1/xnli/ar/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.5578313253012048
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json b/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac2ea9421148ad0ae4106e400778c01975b51947
--- /dev/null
+++ b/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.41164658634538154
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ar/can_we_infer/results.json b/evaluation_l1/xnli/ar/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..dfcac9ad9cb913582ab0487da22cea8e0d7e8e23
--- /dev/null
+++ b/evaluation_l1/xnli/ar/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.5152610441767068
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json b/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5b11969b0e13f92bc6a6161eaf64e997249061de
--- /dev/null
+++ b/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.5803212851405622
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ar/justified_in_saying/results.json b/evaluation_l1/xnli/ar/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..efe506af9c6d3273a1318b96a97687fac279971e
--- /dev/null
+++ b/evaluation_l1/xnli/ar/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.5184738955823294
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/en/GPT-3_style/results.json b/evaluation_l1/xnli/en/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f210a5ed2192d1d6a9b76886e07ad2d5e0b43f00
--- /dev/null
+++ b/evaluation_l1/xnli/en/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.6176706827309237
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/en/MNLI_crowdsource/results.json b/evaluation_l1/xnli/en/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..aab3affbd4f922e20a2e7ff49377b5a9db287063
--- /dev/null
+++ b/evaluation_l1/xnli/en/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.4606425702811245
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/en/can_we_infer/results.json b/evaluation_l1/xnli/en/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c81e210fafab735c745e71971539d6067357f32d
--- /dev/null
+++ b/evaluation_l1/xnli/en/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.5714859437751004
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json b/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3f6783548974144b51902dcc85cea22068f7b94a
--- /dev/null
+++ b/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.6180722891566265
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/en/justified_in_saying/results.json b/evaluation_l1/xnli/en/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c9543f0f76df9a579da6abc88f92b6bdd7e2675
--- /dev/null
+++ b/evaluation_l1/xnli/en/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "en",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.5746987951807229
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/es/GPT-3_style/results.json b/evaluation_l1/xnli/es/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d004908cc470c9648fe66b283ad455e1b834ef0
--- /dev/null
+++ b/evaluation_l1/xnli/es/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.5911646586345382
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/es/MNLI_crowdsource/results.json b/evaluation_l1/xnli/es/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..121a0c50ffb27924c389db1d5d51037b5d6f5491
--- /dev/null
+++ b/evaluation_l1/xnli/es/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.43052208835341366
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/es/can_we_infer/results.json b/evaluation_l1/xnli/es/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d1691995a8a5433ecb240cb1c3d03746c2a51f
--- /dev/null
+++ b/evaluation_l1/xnli/es/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.4397590361445783
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json b/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..49b23b559b781c83bad56c90e5174f1e0b28f1f8
--- /dev/null
+++ b/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.5208835341365462
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/es/justified_in_saying/results.json b/evaluation_l1/xnli/es/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..afeca9dbd848afc14b8b19d223b64f7d5efaaeaa
--- /dev/null
+++ b/evaluation_l1/xnli/es/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.41726907630522087
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/fr/GPT-3_style/results.json b/evaluation_l1/xnli/fr/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..98dfab12ef834b10246c6965b6193ecddadd61e6
--- /dev/null
+++ b/evaluation_l1/xnli/fr/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.5911646586345382
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json b/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..45c5d22e3bf400249196b61eeecaa7da720505e5
--- /dev/null
+++ b/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.4321285140562249
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/fr/can_we_infer/results.json b/evaluation_l1/xnli/fr/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..77d6f700093348996e1d24295849682c1ee5ac64
--- /dev/null
+++ b/evaluation_l1/xnli/fr/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.5369477911646586
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json b/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..88f476e6a408c2d37aa3e4e651313e71bc7f1b3f
--- /dev/null
+++ b/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.5176706827309236
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/fr/justified_in_saying/results.json b/evaluation_l1/xnli/fr/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4bd8c269d5762e1069423690ccdde27e4e56e3b
--- /dev/null
+++ b/evaluation_l1/xnli/fr/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.5385542168674698
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/hi/GPT-3_style/results.json b/evaluation_l1/xnli/hi/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..90d22bda903ba93a0e0324665ec828fc885d8156
--- /dev/null
+++ b/evaluation_l1/xnli/hi/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.5208835341365462
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json b/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0b58e1ba7eec12668ad871080febcfaf9860649f
--- /dev/null
+++ b/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.3819277108433735
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/hi/can_we_infer/results.json b/evaluation_l1/xnli/hi/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b58a33fd1887c977c1b06221f564dbe217c2e539
--- /dev/null
+++ b/evaluation_l1/xnli/hi/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.44176706827309237
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json b/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c768d703f49038fc3b9b146f5f22fe421746912a
--- /dev/null
+++ b/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.5253012048192771
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/hi/justified_in_saying/results.json b/evaluation_l1/xnli/hi/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..176c133af7bf8b355f7e2b4214c435c7b241de40
--- /dev/null
+++ b/evaluation_l1/xnli/hi/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.44377510040160645
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/sw/GPT-3_style/results.json b/evaluation_l1/xnli/sw/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..745c75053efd47e267b816d7fc9ef9059b9af430
--- /dev/null
+++ b/evaluation_l1/xnli/sw/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.5036144578313253
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json b/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fbf12e7a9737e4355d54e58ae5db9d89f701b93
--- /dev/null
+++ b/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.3887550200803213
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/sw/can_we_infer/results.json b/evaluation_l1/xnli/sw/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c04f874fe0f58a56b90ffb038bee36b10fbb2b8c
--- /dev/null
+++ b/evaluation_l1/xnli/sw/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.44216867469879517
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json b/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1a771df194005c2f9a30786e341de8e9a310609
--- /dev/null
+++ b/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.38795180722891565
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/sw/justified_in_saying/results.json b/evaluation_l1/xnli/sw/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b06b5ac8c564d4da30284d6f8d545022a192b420
--- /dev/null
+++ b/evaluation_l1/xnli/sw/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.4397590361445783
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ur/GPT-3_style/results.json b/evaluation_l1/xnli/ur/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..361ed448d271cdef8b6876d50d3e9500e535bcc8
--- /dev/null
+++ b/evaluation_l1/xnli/ur/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.4907630522088353
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json b/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d09364d03f5460b06357537390716479b588a719
--- /dev/null
+++ b/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.37309236947791163
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ur/can_we_infer/results.json b/evaluation_l1/xnli/ur/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..55616a19ddbd889cc33255b73fdc8fa6b91e68ec
--- /dev/null
+++ b/evaluation_l1/xnli/ur/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.45863453815261046
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json b/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d0ae8991d72c90be89ca5f763e817b03ac05d5
--- /dev/null
+++ b/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.5124497991967871
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/ur/justified_in_saying/results.json b/evaluation_l1/xnli/ur/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb743b1f0c2c178dec6881a3be1d5336589cbfa1
--- /dev/null
+++ b/evaluation_l1/xnli/ur/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.45582329317269077
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/vi/GPT-3_style/results.json b/evaluation_l1/xnli/vi/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e77b5f3004afba26184222f7a45fadcaabd63989
--- /dev/null
+++ b/evaluation_l1/xnli/vi/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.5582329317269076
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json b/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f84b726698ca47e601a1aa4e3896d0ee7185e510
--- /dev/null
+++ b/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.42690763052208835
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/vi/can_we_infer/results.json b/evaluation_l1/xnli/vi/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4fd391f132bfedf7cd39efd2b6656b564d343458
--- /dev/null
+++ b/evaluation_l1/xnli/vi/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.4759036144578313
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json b/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab70dc2fca42dab93474d9a5f5432d1bc74968ff
--- /dev/null
+++ b/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.5008032128514056
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/vi/justified_in_saying/results.json b/evaluation_l1/xnli/vi/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5186d53e6ea41cde8f8a201fa2699a49eb570ba5
--- /dev/null
+++ b/evaluation_l1/xnli/vi/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.4827309236947791
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/zh/GPT-3_style/results.json b/evaluation_l1/xnli/zh/GPT-3_style/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5fbaef1325c16e12a77c3c23abd166c6fa411e1c
--- /dev/null
+++ b/evaluation_l1/xnli/zh/GPT-3_style/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "GPT-3 style",
+ "evaluation": {
+ "accuracy": 0.5550200803212851
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json b/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b2ff42f6dc33b2fb66c0047bacea591578bf91e
--- /dev/null
+++ b/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "MNLI crowdsource",
+ "evaluation": {
+ "accuracy": 0.4248995983935743
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/zh/can_we_infer/results.json b/evaluation_l1/xnli/zh/can_we_infer/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..82d063dec3853aa0ea836bb0c48c95984a99f234
--- /dev/null
+++ b/evaluation_l1/xnli/zh/can_we_infer/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "can we infer",
+ "evaluation": {
+ "accuracy": 0.43052208835341366
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json b/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..18da408e415812cc705f7f27ba28ee3069c8b85a
--- /dev/null
+++ b/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "guaranteed/possible/impossible",
+ "evaluation": {
+ "accuracy": 0.5526104417670683
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_l1/xnli/zh/justified_in_saying/results.json b/evaluation_l1/xnli/zh/justified_in_saying/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f668b2453fefbc4459efecb5e12ee1831705740c
--- /dev/null
+++ b/evaluation_l1/xnli/zh/justified_in_saying/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "justified in saying",
+ "evaluation": {
+ "accuracy": 0.44016064257028115
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..08eaf8d55649a67760962798f16115077603ea06
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Answer Given options_armt",
+ "evaluation": {
+ "accuracy": 0.8941098610191925
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..773f18b6f44483c2b276c74fa8dba3a73173f336
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Choose Story Ending_armt",
+ "evaluation": {
+ "accuracy": 0.9404367968232958
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1019f9b10c16be41066096a733d486afaec79fe9
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Generate Ending_armt",
+ "evaluation": {
+ "accuracy": 0.6598279285241562
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ef0ca9ebf98a076846d9f2cfc5e299323efb0ce
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Novel Correct Ending_armt",
+ "evaluation": {
+ "accuracy": 0.9272005294506949
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8a8a2cfc67d927bf9dd38579780168915877b79f
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "ar",
+ "template_name": "Story Continuation and Options_armt",
+ "evaluation": {
+ "accuracy": 0.9172733289212442
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9346b5f9fe2b425df3ba7ae90573ef5c7e7fb3d1
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Answer Given options_esmt",
+ "evaluation": {
+ "accuracy": 0.9311714096624751
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ecff6e31deb89478ba84a5dc3a59031dc4c7704
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Choose Story Ending_esmt",
+ "evaluation": {
+ "accuracy": 0.9549966909331569
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..42e4c5650dd6e7becbb01e1846ed5287011574a3
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Generate Ending_esmt",
+ "evaluation": {
+ "accuracy": 0.7405691594970218
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d175315a4a49d7ffbc1c794488f2c4170c72aa3
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Novel Correct Ending_esmt",
+ "evaluation": {
+ "accuracy": 0.9490403706154864
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a92049818a18be1bd1dc53949dd5f5dab08bb5b6
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "es",
+ "template_name": "Story Continuation and Options_esmt",
+ "evaluation": {
+ "accuracy": 0.9523494374586366
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..82077be9a713eba8286c85e8f7a54f405f67493c
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Answer Given options_eumt",
+ "evaluation": {
+ "accuracy": 0.7326273990734613
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ffff53ee20de1422ddc411de36952d95f5c26df
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Choose Story Ending_eumt",
+ "evaluation": {
+ "accuracy": 0.8682991396426207
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..11f291c388061a9a54cebf327d928b0e57b3b414
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Generate Ending_eumt",
+ "evaluation": {
+ "accuracy": 0.6293845135671741
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6f8a03f86ea1eb3fed92e88b02a7d03e38c1a0a2
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Novel Correct Ending_eumt",
+ "evaluation": {
+ "accuracy": 0.8305757776307081
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2e3bb84f6e97d3686dfadd1dd1241be67e255127
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "eu",
+ "template_name": "Story Continuation and Options_eumt",
+ "evaluation": {
+ "accuracy": 0.8259430840502978
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9f600887dc434b8634d25cb214fceaeb6670e60e
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Answer Given options_himt",
+ "evaluation": {
+ "accuracy": 0.8530774321641297
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5d10e8076e863899177eb48671f8f8510760cabc
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Choose Story Ending_himt",
+ "evaluation": {
+ "accuracy": 0.8914626075446724
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..748d79306b080fe587762245d865188c38858bdb
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Generate Ending_himt",
+ "evaluation": {
+ "accuracy": 0.6644606221045665
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..12f20fab7fb8dfe1bed8c7d32adaf44c74a8ea79
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Novel Correct Ending_himt",
+ "evaluation": {
+ "accuracy": 0.8821972203838517
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..57a270e97d7e8a4d2a480b22824e9bd101b5c200
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "hi",
+ "template_name": "Story Continuation and Options_himt",
+ "evaluation": {
+ "accuracy": 0.8735936465916612
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7455cbf4b0e4932b109dabdb981ceca022d78048
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Answer Given options_idmt",
+ "evaluation": {
+ "accuracy": 0.8682991396426207
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7ac777f4ceac3494e1523f45ddbe7948803f95df
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Choose Story Ending_idmt",
+ "evaluation": {
+ "accuracy": 0.927862342819325
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d23d62b87eebcf899b1a5e9822834ce0ab500ae3
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Generate Ending_idmt",
+ "evaluation": {
+ "accuracy": 0.6929185969556585
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..349fb09ae9669f153cee1b2fafd8a3454cfcee50
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Novel Correct Ending_idmt",
+ "evaluation": {
+ "accuracy": 0.9086697551290536
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..07b42403a63a6e9a0d9cd305f9d626b80f2cb0dd
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "id",
+ "template_name": "Story Continuation and Options_idmt",
+ "evaluation": {
+ "accuracy": 0.9159497021839841
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..622a2b5c90190978f50caa3771721da393dc5e48
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Answer Given options_zhmt",
+ "evaluation": {
+ "accuracy": 0.913964262078094
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..74d7933a087ca4d3b33932b27692f9a149c889c7
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Choose Story Ending_zhmt",
+ "evaluation": {
+ "accuracy": 0.9238914626075446
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7d07f0bc402a8bc64659f7ad0d662a11563f472
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Generate Ending_zhmt",
+ "evaluation": {
+ "accuracy": 0.6843150231634679
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f2cc73ab58a67c8e296e4d941608f9ee4ddceed1
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Novel Correct Ending_zhmt",
+ "evaluation": {
+ "accuracy": 0.9252150893448048
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3cccd12bcf6b155e7860a35ac58b67635ec70895
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xstory_cloze",
+ "dataset_config_name": "zh",
+ "template_name": "Story Continuation and Options_zhmt",
+ "evaluation": {
+ "accuracy": 0.913302448709464
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..22618bc9f7b3bfe018197f506330b9ac0ffcf243
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "Replace_frmt",
+ "evaluation": {
+ "accuracy": 0.6626506024096386
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..081254c848e81d983b60c29b734e4e228f88cff5
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "True or False_frmt",
+ "evaluation": {
+ "accuracy": 0.4578313253012048
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c1ee4a09d6ceaaba5fa1afaa5d2b69d1682d596
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "does underscore refer to_frmt",
+ "evaluation": {
+ "accuracy": 0.5783132530120482
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3cf0b9d6d126b11ffd3d6820319d25b229d8cbdb
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "stand for_frmt",
+ "evaluation": {
+ "accuracy": 0.5421686746987951
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5d4c32dc9b18cbb25f767896325a96dbae2516d8
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "fr",
+ "template_name": "underscore refer to_frmt",
+ "evaluation": {
+ "accuracy": 0.6265060240963856
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e34759d77b70ee482aade2ca041c153ed8b79e63
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "Replace_ptmt",
+ "evaluation": {
+ "accuracy": 0.6273764258555133
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..69542cc44b7cff9aeb10f59d8455b7853479e3d3
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "True or False_ptmt",
+ "evaluation": {
+ "accuracy": 0.532319391634981
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..44700ff9b05ead1e066bcce7df9fa17041b6f8f2
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "does underscore refer to_ptmt",
+ "evaluation": {
+ "accuracy": 0.596958174904943
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5fdda9ecd6b979210133cd38a86a29e9aa3bab1f
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "stand for_ptmt",
+ "evaluation": {
+ "accuracy": 0.5399239543726235
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e2aaeea1b9fb349a7ff9e70ec10f10dca34a10c
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "pt",
+ "template_name": "underscore refer to_ptmt",
+ "evaluation": {
+ "accuracy": 0.623574144486692
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7e97551910c6ec94f2103fe0bb0d23e98453bb09
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "Replace_zhmt",
+ "evaluation": {
+ "accuracy": 0.7202380952380952
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a5fe840ade7d5406e9be3869192447224fe6c64
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "True or False_zhmt",
+ "evaluation": {
+ "accuracy": 0.5099206349206349
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..05a41bc50f8a282e2b75434bb1662975be85e479
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "does underscore refer to_zhmt",
+ "evaluation": {
+ "accuracy": 0.6746031746031746
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2e81e46585f08d06cfa91467de512a7b59dced33
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "stand for_zhmt",
+ "evaluation": {
+ "accuracy": 0.5654761904761905
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d26b81ae8659f7cc0d23004b1dd7982239f52469
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "Muennighoff/xwinograd",
+ "dataset_config_name": "zh",
+ "template_name": "underscore refer to_zhmt",
+ "evaluation": {
+ "accuracy": 0.7638888888888888
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/merged.csv b/evaluation_xcopawinostorymt/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..52bd41e5c505fd7c47e2c1749f8939257d057fec
--- /dev/null
+++ b/evaluation_xcopawinostorymt/merged.csv
@@ -0,0 +1,86 @@
+dataset,prompt,metric,value
+xcopa_id,C1 or C2? premise_idmt,accuracy,0.57
+xcopa_id,best_option_idmt,accuracy,0.78
+xcopa_id,cause_effect_idmt,accuracy,0.84
+xcopa_id,i_am_hesitating_idmt,accuracy,0.84
+xcopa_id,plausible_alternatives_idmt,accuracy,0.83
+xcopa_id,median,accuracy,0.83
+xcopa_sw,C1 or C2? premise_swmt,accuracy,0.6
+xcopa_sw,best_option_swmt,accuracy,0.59
+xcopa_sw,cause_effect_swmt,accuracy,0.63
+xcopa_sw,i_am_hesitating_swmt,accuracy,0.67
+xcopa_sw,plausible_alternatives_swmt,accuracy,0.62
+xcopa_sw,median,accuracy,0.62
+xcopa_ta,C1 or C2? premise_tamt,accuracy,0.64
+xcopa_ta,best_option_tamt,accuracy,0.56
+xcopa_ta,cause_effect_tamt,accuracy,0.62
+xcopa_ta,i_am_hesitating_tamt,accuracy,0.64
+xcopa_ta,plausible_alternatives_tamt,accuracy,0.63
+xcopa_ta,median,accuracy,0.63
+xcopa_vi,C1 or C2? premise_vimt,accuracy,0.61
+xcopa_vi,best_option_vimt,accuracy,0.77
+xcopa_vi,cause_effect_vimt,accuracy,0.89
+xcopa_vi,i_am_hesitating_vimt,accuracy,0.85
+xcopa_vi,plausible_alternatives_vimt,accuracy,0.87
+xcopa_vi,median,accuracy,0.85
+xcopa_zh,C1 or C2? premise_zhmt,accuracy,0.63
+xcopa_zh,best_option_zhmt,accuracy,0.75
+xcopa_zh,cause_effect_zhmt,accuracy,0.83
+xcopa_zh,i_am_hesitating_zhmt,accuracy,0.84
+xcopa_zh,plausible_alternatives_zhmt,accuracy,0.86
+xcopa_zh,median,accuracy,0.83
+xstory_cloze_ar,Answer Given options_armt,accuracy,0.8941098610191925
+xstory_cloze_ar,Choose Story Ending_armt,accuracy,0.9404367968232958
+xstory_cloze_ar,Generate Ending_armt,accuracy,0.6598279285241562
+xstory_cloze_ar,Novel Correct Ending_armt,accuracy,0.9272005294506949
+xstory_cloze_ar,Story Continuation and Options_armt,accuracy,0.9172733289212442
+xstory_cloze_ar,median,accuracy,0.9172733289212442
+xstory_cloze_es,Answer Given options_esmt,accuracy,0.9311714096624751
+xstory_cloze_es,Choose Story Ending_esmt,accuracy,0.9549966909331569
+xstory_cloze_es,Generate Ending_esmt,accuracy,0.7405691594970218
+xstory_cloze_es,Novel Correct Ending_esmt,accuracy,0.9490403706154864
+xstory_cloze_es,Story Continuation and Options_esmt,accuracy,0.9523494374586366
+xstory_cloze_es,median,accuracy,0.9490403706154864
+xstory_cloze_eu,Answer Given options_eumt,accuracy,0.7326273990734613
+xstory_cloze_eu,Choose Story Ending_eumt,accuracy,0.8682991396426207
+xstory_cloze_eu,Generate Ending_eumt,accuracy,0.6293845135671741
+xstory_cloze_eu,Novel Correct Ending_eumt,accuracy,0.8305757776307081
+xstory_cloze_eu,Story Continuation and Options_eumt,accuracy,0.8259430840502978
+xstory_cloze_eu,median,accuracy,0.8259430840502978
+xstory_cloze_hi,Answer Given options_himt,accuracy,0.8530774321641297
+xstory_cloze_hi,Choose Story Ending_himt,accuracy,0.8914626075446724
+xstory_cloze_hi,Generate Ending_himt,accuracy,0.6644606221045665
+xstory_cloze_hi,Novel Correct Ending_himt,accuracy,0.8821972203838517
+xstory_cloze_hi,Story Continuation and Options_himt,accuracy,0.8735936465916612
+xstory_cloze_hi,median,accuracy,0.8735936465916612
+xstory_cloze_id,Answer Given options_idmt,accuracy,0.8682991396426207
+xstory_cloze_id,Choose Story Ending_idmt,accuracy,0.927862342819325
+xstory_cloze_id,Generate Ending_idmt,accuracy,0.6929185969556585
+xstory_cloze_id,Novel Correct Ending_idmt,accuracy,0.9086697551290536
+xstory_cloze_id,Story Continuation and Options_idmt,accuracy,0.9159497021839841
+xstory_cloze_id,median,accuracy,0.9086697551290536
+xstory_cloze_zh,Answer Given options_zhmt,accuracy,0.913964262078094
+xstory_cloze_zh,Choose Story Ending_zhmt,accuracy,0.9238914626075446
+xstory_cloze_zh,Generate Ending_zhmt,accuracy,0.6843150231634679
+xstory_cloze_zh,Novel Correct Ending_zhmt,accuracy,0.9252150893448048
+xstory_cloze_zh,Story Continuation and Options_zhmt,accuracy,0.913302448709464
+xstory_cloze_zh,median,accuracy,0.913964262078094
+xwinograd_fr,Replace_frmt,accuracy,0.6626506024096386
+xwinograd_fr,True or False_frmt,accuracy,0.4578313253012048
+xwinograd_fr,does underscore refer to_frmt,accuracy,0.5783132530120482
+xwinograd_fr,stand for_frmt,accuracy,0.5421686746987951
+xwinograd_fr,underscore refer to_frmt,accuracy,0.6265060240963856
+xwinograd_fr,median,accuracy,0.5783132530120482
+xwinograd_pt,Replace_ptmt,accuracy,0.6273764258555133
+xwinograd_pt,True or False_ptmt,accuracy,0.532319391634981
+xwinograd_pt,does underscore refer to_ptmt,accuracy,0.596958174904943
+xwinograd_pt,stand for_ptmt,accuracy,0.5399239543726235
+xwinograd_pt,underscore refer to_ptmt,accuracy,0.623574144486692
+xwinograd_pt,median,accuracy,0.596958174904943
+xwinograd_zh,Replace_zhmt,accuracy,0.7202380952380952
+xwinograd_zh,True or False_zhmt,accuracy,0.5099206349206349
+xwinograd_zh,does underscore refer to_zhmt,accuracy,0.6746031746031746
+xwinograd_zh,stand for_zhmt,accuracy,0.5654761904761905
+xwinograd_zh,underscore refer to_zhmt,accuracy,0.7638888888888888
+xwinograd_zh,median,accuracy,0.6746031746031746
+multiple,average,multiple,0.7855970749932859
diff --git a/evaluation_xcopawinostorymt/merged.json b/evaluation_xcopawinostorymt/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ba79edf768cf12f26f0a9792c6b293b2ffeffc7
--- /dev/null
+++ b/evaluation_xcopawinostorymt/merged.json
@@ -0,0 +1 @@
+{"Muennighoff/xstory_cloze_ar": {"Answer Given options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8941098610191925}, "template_name": "Answer Given options_armt"}, "Choose Story Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9404367968232958}, "template_name": "Choose Story Ending_armt"}, "Generate Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6598279285241562}, "template_name": "Generate Ending_armt"}, "Novel Correct Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9272005294506949}, "template_name": "Novel Correct Ending_armt"}, "Story Continuation and Options_armt": {"arguments": "Namespace(config_name=None, 
dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9172733289212442}, "template_name": "Story Continuation and Options_armt"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9311714096624751}, "template_name": "Answer Given options_esmt"}, "Choose Story Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9549966909331569}, "template_name": "Choose Story Ending_esmt"}, "Generate Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7405691594970218}, "template_name": "Generate Ending_esmt"}, "Novel Correct Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, 
dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9490403706154864}, "template_name": "Novel Correct Ending_esmt"}, "Story Continuation and Options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9523494374586366}, "template_name": "Story Continuation and Options_esmt"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7326273990734613}, "template_name": "Answer Given options_eumt"}, "Choose Story Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8682991396426207}, "template_name": "Choose Story Ending_eumt"}, "Generate Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6293845135671741}, "template_name": "Generate Ending_eumt"}, "Novel Correct Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8305757776307081}, "template_name": "Novel Correct Ending_eumt"}, "Story Continuation and Options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8259430840502978}, "template_name": "Story Continuation and Options_eumt"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8530774321641297}, "template_name": "Answer Given options_himt"}, "Choose Story Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8914626075446724}, "template_name": "Choose Story Ending_himt"}, "Generate Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6644606221045665}, "template_name": "Generate Ending_himt"}, "Novel Correct Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8821972203838517}, "template_name": "Novel Correct Ending_himt"}, "Story Continuation and Options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8735936465916612}, "template_name": "Story Continuation and Options_himt"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8682991396426207}, "template_name": "Answer Given options_idmt"}, "Choose Story Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.927862342819325}, "template_name": "Choose Story Ending_idmt"}, "Generate Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6929185969556585}, "template_name": "Generate Ending_idmt"}, "Novel Correct Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9086697551290536}, "template_name": "Novel Correct Ending_idmt"}, "Story Continuation and Options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9159497021839841}, "template_name": "Story Continuation and Options_idmt"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.913964262078094}, "template_name": "Answer Given options_zhmt"}, "Choose Story Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9238914626075446}, "template_name": "Choose Story Ending_zhmt"}, "Generate Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6843150231634679}, "template_name": "Generate Ending_zhmt"}, "Novel Correct Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9252150893448048}, "template_name": "Novel Correct Ending_zhmt"}, "Story Continuation and Options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.913302448709464}, "template_name": "Story Continuation and Options_zhmt"}}, "Muennighoff/xwinograd_fr": {"Replace_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6626506024096386}, "template_name": "Replace_frmt"}, "True or False_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4578313253012048}, "template_name": "True or False_frmt"}, "does underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5783132530120482}, "template_name": "does underscore refer to_frmt"}, "stand for_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5421686746987951}, "template_name": "stand for_frmt"}, "underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6265060240963856}, "template_name": "underscore refer to_frmt"}}, "Muennighoff/xwinograd_pt": {"Replace_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6273764258555133}, "template_name": "Replace_ptmt"}, "True or False_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, 
split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.532319391634981}, "template_name": "True or False_ptmt"}, "does underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.596958174904943}, "template_name": "does underscore refer to_ptmt"}, "stand for_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5399239543726235}, "template_name": "stand for_ptmt"}, "underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.623574144486692}, "template_name": "underscore refer to_ptmt"}}, "Muennighoff/xwinograd_zh": {"Replace_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": 
"Muennighoff/xwinograd", "evaluation": {"accuracy": 0.7202380952380952}, "template_name": "Replace_zhmt"}, "True or False_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5099206349206349}, "template_name": "True or False_zhmt"}, "does underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6746031746031746}, "template_name": "does underscore refer to_zhmt"}, "stand for_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5654761904761905}, "template_name": "stand for_zhmt"}, "underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.7638888888888888}, "template_name": "underscore refer to_zhmt"}}, "xcopa_id": {"C1 or C2? 
premise_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.57}, "template_name": "C1 or C2? premise_idmt"}, "best_option_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.78}, "template_name": "best_option_idmt"}, "cause_effect_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "cause_effect_idmt"}, "i_am_hesitating_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "i_am_hesitating_idmt"}, "plausible_alternatives_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.83}, "template_name": "plausible_alternatives_idmt"}}, "xcopa_sw": {"C1 or C2? premise_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? premise_swmt"}, "best_option_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.59}, "template_name": "best_option_swmt"}, "cause_effect_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "cause_effect_swmt"}, "i_am_hesitating_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)", 
"dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.67}, "template_name": "i_am_hesitating_swmt"}, "plausible_alternatives_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "plausible_alternatives_swmt"}}, "xcopa_ta": {"C1 or C2? premise_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "C1 or C2? premise_tamt"}, "best_option_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.56}, "template_name": "best_option_tamt"}, "cause_effect_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "cause_effect_tamt"}, "i_am_hesitating_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "i_am_hesitating_tamt"}, "plausible_alternatives_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "plausible_alternatives_tamt"}}, "xcopa_vi": {"C1 or C2? premise_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.61}, "template_name": "C1 or C2? 
premise_vimt"}, "best_option_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.77}, "template_name": "best_option_vimt"}, "cause_effect_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.89}, "template_name": "cause_effect_vimt"}, "i_am_hesitating_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.85}, "template_name": "i_am_hesitating_vimt"}, "plausible_alternatives_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.87}, "template_name": "plausible_alternatives_vimt"}}, "xcopa_zh": {"C1 or C2? 
premise_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "C1 or C2? premise_zhmt"}, "best_option_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.75}, "template_name": "best_option_zhmt"}, "cause_effect_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.83}, "template_name": "cause_effect_zhmt"}, "i_am_hesitating_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "i_am_hesitating_zhmt"}, "plausible_alternatives_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', 
output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.86}, "template_name": "plausible_alternatives_zhmt"}}}
\ No newline at end of file
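The hunk above adds a single aggregated results file whose top-level keys are "<dataset_name>_<config>" strings, each mapping template names to an entry with an "evaluation" object holding "accuracy". Below is a minimal sketch, not part of this commit, of how such a file could be inspected; "merged_results.json" is a placeholder, since the actual filename appears in the hunk header earlier in the diff rather than in this excerpt.

# Sketch only: query the aggregated evaluation JSON added above.
# Assumes the nesting visible in the hunk:
#   {"<dataset>_<config>": {"<template>": {"evaluation": {"accuracy": ...}, ...}, ...}}
# "merged_results.json" is a placeholder path, not the real filename.
import json

with open("merged_results.json", encoding="utf-8") as fh:
    merged = json.load(fh)

for dataset_config, templates in merged.items():
    for template_name, entry in templates.items():
        acc = entry["evaluation"]["accuracy"]
        print(f"{dataset_config} / {template_name}: accuracy = {acc:.4f}")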
diff --git a/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3b159b6975c5a2a3ba1704d4a114cff18e67241
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "C1 or C2? premise_idmt",
+ "evaluation": {
+ "accuracy": 0.57
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..855f53dfb43806d722065ba2cfa8ca448756a2cb
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "best_option_idmt",
+ "evaluation": {
+ "accuracy": 0.78
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c68da0c2acb059fa1add54ef45470d452c3e9c07
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "cause_effect_idmt",
+ "evaluation": {
+ "accuracy": 0.84
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bc354e5e99e62eebea0b523de3823573281b5791
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "i_am_hesitating_idmt",
+ "evaluation": {
+ "accuracy": 0.84
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..56547788a32feb5d2b997d3590f637dae6eb98dc
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "id",
+ "template_name": "plausible_alternatives_idmt",
+ "evaluation": {
+ "accuracy": 0.83
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ece2dd50d4d67e426e5e519747b1841c335c8c2c
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "C1 or C2? premise_swmt",
+ "evaluation": {
+ "accuracy": 0.6
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1dd7fd8221b9ed98ae38d201ec20eb69fd81e899
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "best_option_swmt",
+ "evaluation": {
+ "accuracy": 0.59
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8eb58372c78dd53c849b27c08b98c3e9c6d8eaad
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "cause_effect_swmt",
+ "evaluation": {
+ "accuracy": 0.63
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e803205adcfefba9f30d43af3290e2004e39e3aa
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "i_am_hesitating_swmt",
+ "evaluation": {
+ "accuracy": 0.67
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..51f2499b03c23ca354dfe8be026a5d87885a7929
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "sw",
+ "template_name": "plausible_alternatives_swmt",
+ "evaluation": {
+ "accuracy": 0.62
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3ef16661ff501e41d7fb1a38c1aa3bb5289d109
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "C1 or C2? premise_tamt",
+ "evaluation": {
+ "accuracy": 0.64
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..947f7537af74f40d992e21e57b4e0b3031461954
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "best_option_tamt",
+ "evaluation": {
+ "accuracy": 0.56
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b22c4ea50f2cbf069ce00e68da7ce949e4b60a3
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "cause_effect_tamt",
+ "evaluation": {
+ "accuracy": 0.62
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a246afec199aa339abb27753067bd74ff6298781
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "i_am_hesitating_tamt",
+ "evaluation": {
+ "accuracy": 0.64
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..010f33175cc3862712980ab18b2b8cacda387d00
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "ta",
+ "template_name": "plausible_alternatives_tamt",
+ "evaluation": {
+ "accuracy": 0.63
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a11e4478e8d0d68f2b1da5f504c2412c75fdba56
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "C1 or C2? premise_vimt",
+ "evaluation": {
+ "accuracy": 0.61
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a4969d11187ca6c988435275f85d713f14da8ba7
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "best_option_vimt",
+ "evaluation": {
+ "accuracy": 0.77
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3f3e2a8ef414dda19185126456aba80cc3c9dd64
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "cause_effect_vimt",
+ "evaluation": {
+ "accuracy": 0.89
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c4786bd77a9ec21e35c1b62f2ec5a5a548975afe
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "i_am_hesitating_vimt",
+ "evaluation": {
+ "accuracy": 0.85
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..45f092dd4589031c1a20671997c0f94b75e80371
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "vi",
+ "template_name": "plausible_alternatives_vimt",
+ "evaluation": {
+ "accuracy": 0.87
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6fa79011c4d9ffa2bebb87d9c24606bc43b1d35d
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "C1 or C2? premise_zhmt",
+ "evaluation": {
+ "accuracy": 0.63
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4b69455120fb83b58f3f2d20dc0a6de9499b8c25
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "best_option_zhmt",
+ "evaluation": {
+ "accuracy": 0.75
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d45d6dec2e02a88c4efae655983cf388b732b83
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "cause_effect_zhmt",
+ "evaluation": {
+ "accuracy": 0.83
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7f0f51c3e84c028612690851eafd25fa8693114c
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "i_am_hesitating_zhmt",
+ "evaluation": {
+ "accuracy": 0.84
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e0ebbeb1c92528c677e0c3041a5520d00eb246a
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xcopa",
+ "dataset_config_name": "zh",
+ "template_name": "plausible_alternatives_zhmt",
+ "evaluation": {
+ "accuracy": 0.86
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
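Every results.json added in this diff follows the same small schema: dataset_name, dataset_config_name, template_name, an evaluation object holding a single accuracy value, and an arguments string that records the evaluation invocation (bfloat16, max_length=2048, per-device batch size 8, validation split). A minimal sketch, assuming only the directory layout visible above and not part of the repository's own tooling, that tabulates the xcopa accuracies per language and template:

```python
# Sketch only: walk evaluation_xcopawinostorymt/xcopa/<lang>/<template>/results.json
# (layout as added in this diff) and print one accuracy per language/template pair.
import glob
import json

for path in sorted(glob.glob("evaluation_xcopawinostorymt/xcopa/*/*/results.json")):
    with open(path) as f:
        result = json.load(f)
    print(
        result["dataset_config_name"],   # e.g. "id", "sw", "ta", "vi", "zh"
        result["template_name"],         # e.g. "cause_effect_idmt"
        result["evaluation"]["accuracy"],
    )
```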
diff --git a/evaluation_xnlihtmt/xnliht/ar/GPT-3_style_arht/results.json b/evaluation_xnlihtmt/xnliht/ar/GPT-3_style_arht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1151e88244939e66270073875e6352cbcb194777
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ar/GPT-3_style_arht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "GPT-3 style_arht",
+ "evaluation": {
+ "accuracy": 0.40441767068273093
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_arht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/ar/MNLI_crowdsource_arht/results.json b/evaluation_xnlihtmt/xnliht/ar/MNLI_crowdsource_arht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e50548b0ae0c7bda585ddb4ea99660d84f1f86a
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ar/MNLI_crowdsource_arht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "MNLI crowdsource_arht",
+ "evaluation": {
+ "accuracy": 0.43012048192771085
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_arht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/ar/can_we_infer_arht/results.json b/evaluation_xnlihtmt/xnliht/ar/can_we_infer_arht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d4880853137e81090cda44fc001e9425693628e
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ar/can_we_infer_arht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "can we infer_arht",
+ "evaluation": {
+ "accuracy": 0.3610441767068273
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_arht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/ar/guaranteed_possible_impossible_arht/results.json b/evaluation_xnlihtmt/xnliht/ar/guaranteed_possible_impossible_arht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..43aade15d507b7981f09fef192ecbd4de817e416
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ar/guaranteed_possible_impossible_arht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "guaranteed/possible/impossible_arht",
+ "evaluation": {
+ "accuracy": 0.3642570281124498
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_arht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/ar/justified_in_saying_arht/results.json b/evaluation_xnlihtmt/xnliht/ar/justified_in_saying_arht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7673b1d800b7e181915339d05873c3f4f9046b7
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ar/justified_in_saying_arht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "justified in saying_arht",
+ "evaluation": {
+ "accuracy": 0.37309236947791163
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_arht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/es/GPT-3_style_esht/results.json b/evaluation_xnlihtmt/xnliht/es/GPT-3_style_esht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c0ab7199857150c8bdbf32568948c735ac767b6d
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/es/GPT-3_style_esht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "GPT-3 style_esht",
+ "evaluation": {
+ "accuracy": 0.5698795180722892
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/es/MNLI_crowdsource_esht/results.json b/evaluation_xnlihtmt/xnliht/es/MNLI_crowdsource_esht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c71d79218ce58a8e449580613f75a4c2e19e1e8e
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/es/MNLI_crowdsource_esht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "MNLI crowdsource_esht",
+ "evaluation": {
+ "accuracy": 0.342570281124498
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/es/can_we_infer_esht/results.json b/evaluation_xnlihtmt/xnliht/es/can_we_infer_esht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e42be28ecd114e6f6f4d88689f566e7a5aab611
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/es/can_we_infer_esht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "can we infer_esht",
+ "evaluation": {
+ "accuracy": 0.46546184738955826
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/es/guaranteed_possible_impossible_esht/results.json b/evaluation_xnlihtmt/xnliht/es/guaranteed_possible_impossible_esht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0c99dceb99ba9331895c687b2c4e5f3a7f92578e
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/es/guaranteed_possible_impossible_esht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "guaranteed/possible/impossible_esht",
+ "evaluation": {
+ "accuracy": 0.5526104417670683
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/es/justified_in_saying_esht/results.json b/evaluation_xnlihtmt/xnliht/es/justified_in_saying_esht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..fb77a7a139dd8b9c24ea015e0d02823933a7e16a
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/es/justified_in_saying_esht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "justified in saying_esht",
+ "evaluation": {
+ "accuracy": 0.4321285140562249
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/fr/GPT-3_style_frht/results.json b/evaluation_xnlihtmt/xnliht/fr/GPT-3_style_frht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4b13c026436426907cc26ba42d716088ce4b59b4
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/fr/GPT-3_style_frht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "GPT-3 style_frht",
+ "evaluation": {
+ "accuracy": 0.4995983935742972
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/fr/MNLI_crowdsource_frht/results.json b/evaluation_xnlihtmt/xnliht/fr/MNLI_crowdsource_frht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d7e02d1a9f66af72a883aa01d74e2dde9197e98
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/fr/MNLI_crowdsource_frht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "MNLI crowdsource_frht",
+ "evaluation": {
+ "accuracy": 0.4004016064257028
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/fr/can_we_infer_frht/results.json b/evaluation_xnlihtmt/xnliht/fr/can_we_infer_frht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a407e7ae562371e6260c90b1ea0a3c9107aa37f7
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/fr/can_we_infer_frht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "can we infer_frht",
+ "evaluation": {
+ "accuracy": 0.5694779116465863
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/fr/guaranteed_possible_impossible_frht/results.json b/evaluation_xnlihtmt/xnliht/fr/guaranteed_possible_impossible_frht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..aa655ebc807cb20ba71577ff3194250f5616a1f3
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/fr/guaranteed_possible_impossible_frht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "guaranteed/possible/impossible_frht",
+ "evaluation": {
+ "accuracy": 0.5152610441767068
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/fr/justified_in_saying_frht/results.json b/evaluation_xnlihtmt/xnliht/fr/justified_in_saying_frht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..014ade2a465f303693ffe5cab94fbdd643774842
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/fr/justified_in_saying_frht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "justified in saying_frht",
+ "evaluation": {
+ "accuracy": 0.5493975903614458
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/hi/GPT-3_style_hiht/results.json b/evaluation_xnlihtmt/xnliht/hi/GPT-3_style_hiht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a4e92a96761a9a536d4bb8a5e4ba4f4061b7f539
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/hi/GPT-3_style_hiht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "GPT-3 style_hiht",
+ "evaluation": {
+ "accuracy": 0.44417670682730925
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/hi/MNLI_crowdsource_hiht/results.json b/evaluation_xnlihtmt/xnliht/hi/MNLI_crowdsource_hiht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8efef72cd5d3fc77ef9b4366770aaae329a5fb7a
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/hi/MNLI_crowdsource_hiht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "MNLI crowdsource_hiht",
+ "evaluation": {
+ "accuracy": 0.5236947791164659
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/hi/can_we_infer_hiht/results.json b/evaluation_xnlihtmt/xnliht/hi/can_we_infer_hiht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..778e69f3893a372b47a6fbdc28161f775335d96c
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/hi/can_we_infer_hiht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "can we infer_hiht",
+ "evaluation": {
+ "accuracy": 0.4963855421686747
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/hi/guaranteed_possible_impossible_hiht/results.json b/evaluation_xnlihtmt/xnliht/hi/guaranteed_possible_impossible_hiht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ebc1461a52c3cde14887dca4097c766c74b2fbb
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/hi/guaranteed_possible_impossible_hiht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "guaranteed/possible/impossible_hiht",
+ "evaluation": {
+ "accuracy": 0.4493975903614458
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/hi/justified_in_saying_hiht/results.json b/evaluation_xnlihtmt/xnliht/hi/justified_in_saying_hiht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d4e84c9aaefa0f61a8a742fc3d8391391085d071
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/hi/justified_in_saying_hiht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "justified in saying_hiht",
+ "evaluation": {
+ "accuracy": 0.4963855421686747
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/merged.csv b/evaluation_xnlihtmt/xnliht/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..38974d265a2b337eb24fd64b0a27d6af10f7d5c6
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/merged.csv
@@ -0,0 +1,50 @@
+dataset,prompt,metric,value
+xnli_ar,GPT-3 style_arht,accuracy,0.40441767068273093
+xnli_ar,MNLI crowdsource_arht,accuracy,0.43012048192771085
+xnli_ar,can we infer_arht,accuracy,0.3610441767068273
+xnli_ar,guaranteed/possible/impossible_arht,accuracy,0.3642570281124498
+xnli_ar,justified in saying_arht,accuracy,0.37309236947791163
+xnli_ar,median,accuracy,0.37309236947791163
+xnli_es,GPT-3 style_esht,accuracy,0.5698795180722892
+xnli_es,MNLI crowdsource_esht,accuracy,0.342570281124498
+xnli_es,can we infer_esht,accuracy,0.46546184738955826
+xnli_es,guaranteed/possible/impossible_esht,accuracy,0.5526104417670683
+xnli_es,justified in saying_esht,accuracy,0.4321285140562249
+xnli_es,median,accuracy,0.46546184738955826
+xnli_fr,GPT-3 style_frht,accuracy,0.4995983935742972
+xnli_fr,MNLI crowdsource_frht,accuracy,0.4004016064257028
+xnli_fr,can we infer_frht,accuracy,0.5694779116465863
+xnli_fr,guaranteed/possible/impossible_frht,accuracy,0.5152610441767068
+xnli_fr,justified in saying_frht,accuracy,0.5493975903614458
+xnli_fr,median,accuracy,0.5152610441767068
+xnli_hi,GPT-3 style_hiht,accuracy,0.44417670682730925
+xnli_hi,MNLI crowdsource_hiht,accuracy,0.5236947791164659
+xnli_hi,can we infer_hiht,accuracy,0.4963855421686747
+xnli_hi,guaranteed/possible/impossible_hiht,accuracy,0.4493975903614458
+xnli_hi,justified in saying_hiht,accuracy,0.4963855421686747
+xnli_hi,median,accuracy,0.4963855421686747
+xnli_sw,GPT-3 style_swht,accuracy,0.39397590361445783
+xnli_sw,MNLI crowdsource_swht,accuracy,0.3329317269076305
+xnli_sw,can we infer_swht,accuracy,0.4285140562248996
+xnli_sw,guaranteed/possible/impossible_swht,accuracy,0.38433734939759034
+xnli_sw,justified in saying_swht,accuracy,0.41967871485943775
+xnli_sw,median,accuracy,0.39397590361445783
+xnli_ur,GPT-3 style_urht,accuracy,0.463855421686747
+xnli_ur,MNLI crowdsource_urht,accuracy,0.40441767068273093
+xnli_ur,can we infer_urht,accuracy,0.3895582329317269
+xnli_ur,guaranteed/possible/impossible_urht,accuracy,0.3405622489959839
+xnli_ur,justified in saying_urht,accuracy,0.43293172690763054
+xnli_ur,median,accuracy,0.40441767068273093
+xnli_vi,GPT-3 style_viht,accuracy,0.5261044176706827
+xnli_vi,MNLI crowdsource_viht,accuracy,0.39879518072289155
+xnli_vi,can we infer_viht,accuracy,0.5481927710843374
+xnli_vi,guaranteed/possible/impossible_viht,accuracy,0.43694779116465865
+xnli_vi,justified in saying_viht,accuracy,0.46546184738955826
+xnli_vi,median,accuracy,0.46546184738955826
+xnli_zh,GPT-3 style_zhht,accuracy,0.36947791164658633
+xnli_zh,MNLI crowdsource_zhht,accuracy,0.3457831325301205
+xnli_zh,can we infer_zhht,accuracy,0.3441767068273092
+xnli_zh,guaranteed/possible/impossible_zhht,accuracy,0.4923694779116466
+xnli_zh,justified in saying_zhht,accuracy,0.3927710843373494
+xnli_zh,median,accuracy,0.36947791164658633
+multiple,average,multiple,0.4354417670682731
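The merged.csv above condenses the per-template files: for each language it lists the five prompt accuracies, adds a median row (0.3730… for xnli_ar), and the final multiple,average row (0.4354…) is the mean of those eight per-language medians. A minimal sketch of that aggregation, assuming only the directory layout shown in this diff (this is not the original merge script):

```python
# Sketch only: reproduce the per-language "median" rows and the final
# "multiple,average" row of merged.csv from the results.json files above.
import glob
import json
import os
from statistics import mean, median

root = "evaluation_xnlihtmt/xnliht"  # layout as added in this diff
medians = {}
for lang_dir in sorted(glob.glob(os.path.join(root, "*"))):
    if not os.path.isdir(lang_dir):
        continue  # skip merged.csv / merged.json
    accuracies = []
    for path in glob.glob(os.path.join(lang_dir, "*", "results.json")):
        with open(path) as f:
            accuracies.append(json.load(f)["evaluation"]["accuracy"])
    if accuracies:
        medians[os.path.basename(lang_dir)] = median(accuracies)

for lang, med in sorted(medians.items()):
    print(f"xnli_{lang},median,accuracy,{med}")
print(f"multiple,average,multiple,{mean(medians.values())}")
```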
diff --git a/evaluation_xnlihtmt/xnliht/merged.json b/evaluation_xnlihtmt/xnliht/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d843c2753dd0cbff8718e6589bb67fa198770c6
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/merged.json
@@ -0,0 +1 @@
+{"xnli_ar": {"GPT-3 style_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40441767068273093}, "template_name": "GPT-3 style_arht"}, "MNLI crowdsource_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43012048192771085}, "template_name": "MNLI crowdsource_arht"}, "can we infer_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3610441767068273}, "template_name": "can we infer_arht"}, "guaranteed/possible/impossible_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3642570281124498}, "template_name": "guaranteed/possible/impossible_arht"}, "justified in saying_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37309236947791163}, "template_name": "justified in saying_arht"}}, "xnli_es": {"GPT-3 style_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5698795180722892}, "template_name": "GPT-3 style_esht"}, "MNLI crowdsource_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.342570281124498}, "template_name": "MNLI crowdsource_esht"}, "can we infer_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.46546184738955826}, "template_name": "can we infer_esht"}, "guaranteed/possible/impossible_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5526104417670683}, "template_name": "guaranteed/possible/impossible_esht"}, "justified in saying_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4321285140562249}, "template_name": "justified in saying_esht"}}, "xnli_fr": {"GPT-3 style_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4995983935742972}, "template_name": "GPT-3 style_frht"}, "MNLI crowdsource_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4004016064257028}, "template_name": "MNLI crowdsource_frht"}, "can we infer_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 
0.5694779116465863}, "template_name": "can we infer_frht"}, "guaranteed/possible/impossible_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5152610441767068}, "template_name": "guaranteed/possible/impossible_frht"}, "justified in saying_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5493975903614458}, "template_name": "justified in saying_frht"}}, "xnli_hi": {"GPT-3 style_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44417670682730925}, "template_name": "GPT-3 style_hiht"}, "MNLI crowdsource_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5236947791164659}, "template_name": "MNLI crowdsource_hiht"}, "can we infer_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4963855421686747}, "template_name": "can we infer_hiht"}, "guaranteed/possible/impossible_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4493975903614458}, "template_name": "guaranteed/possible/impossible_hiht"}, "justified in saying_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4963855421686747}, "template_name": "justified in saying_hiht"}}, "xnli_sw": {"GPT-3 style_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39397590361445783}, "template_name": "GPT-3 style_swht"}, "MNLI crowdsource_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3329317269076305}, "template_name": "MNLI crowdsource_swht"}, "can we infer_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4285140562248996}, "template_name": "can we infer_swht"}, "guaranteed/possible/impossible_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38433734939759034}, "template_name": "guaranteed/possible/impossible_swht"}, "justified in saying_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41967871485943775}, "template_name": "justified in saying_swht"}}, "xnli_ur": {"GPT-3 style_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", 
"evaluation": {"accuracy": 0.463855421686747}, "template_name": "GPT-3 style_urht"}, "MNLI crowdsource_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40441767068273093}, "template_name": "MNLI crowdsource_urht"}, "can we infer_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3895582329317269}, "template_name": "can we infer_urht"}, "guaranteed/possible/impossible_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3405622489959839}, "template_name": "guaranteed/possible/impossible_urht"}, "justified in saying_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43293172690763054}, "template_name": "justified in saying_urht"}}, "xnli_vi": {"GPT-3 style_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5261044176706827}, "template_name": "GPT-3 style_viht"}, "MNLI crowdsource_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39879518072289155}, "template_name": "MNLI crowdsource_viht"}, "can we infer_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5481927710843374}, "template_name": "can we infer_viht"}, "guaranteed/possible/impossible_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43694779116465865}, "template_name": "guaranteed/possible/impossible_viht"}, "justified in saying_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.46546184738955826}, "template_name": "justified in saying_viht"}}, "xnli_zh": {"GPT-3 style_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.36947791164658633}, "template_name": "GPT-3 style_zhht"}, "MNLI crowdsource_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3457831325301205}, "template_name": "MNLI crowdsource_zhht"}, "can we infer_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3441767068273092}, "template_name": "can we infer_zhht"}, "guaranteed/possible/impossible_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4923694779116466}, 
"template_name": "guaranteed/possible/impossible_zhht"}, "justified in saying_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3927710843373494}, "template_name": "justified in saying_zhht"}}}
\ No newline at end of file
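The merged JSON blob added above maps each task key (e.g. "xnli_ar", "xnli_zh") to one entry per prompt template, with the score stored under "evaluation" -> "accuracy". A minimal sketch of how such a file could be summarized per language, assuming a hypothetical local path for the merged file (the actual filename comes from the diff header for this blob, not from this sketch):

```python
# Illustrative sketch only, not part of this repository.
# Summarizes a merged results file shaped like the JSON above:
# {"xnli_ar": {"<template>": {"evaluation": {"accuracy": ...}, ...}, ...}, ...}
import json
import statistics

# Hypothetical path; substitute the merged results file added in this diff.
with open("evaluation_xnlihtmt/merged.json", encoding="utf-8") as f:
    results = json.load(f)

for task, templates in results.items():
    # One accuracy value per prompt template for this task/language.
    accuracies = [entry["evaluation"]["accuracy"] for entry in templates.values()]
    print(f"{task}: median={statistics.median(accuracies):.4f} "
          f"over {len(accuracies)} templates")
```

The median over prompt templates is just one way to condense these numbers into a per-language figure; a mean or per-template table could be substituted without changing how the file is read.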
diff --git a/evaluation_xnlihtmt/xnliht/sw/GPT-3_style_swht/results.json b/evaluation_xnlihtmt/xnliht/sw/GPT-3_style_swht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7b7b923b78fb24fbccb524ad75690014a17cac7
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/sw/GPT-3_style_swht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "GPT-3 style_swht",
+ "evaluation": {
+ "accuracy": 0.39397590361445783
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/sw/MNLI_crowdsource_swht/results.json b/evaluation_xnlihtmt/xnliht/sw/MNLI_crowdsource_swht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..de9fd9c9a0227ffb16da0b5303649cad3dfccb9f
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/sw/MNLI_crowdsource_swht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "MNLI crowdsource_swht",
+ "evaluation": {
+ "accuracy": 0.3329317269076305
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/sw/can_we_infer_swht/results.json b/evaluation_xnlihtmt/xnliht/sw/can_we_infer_swht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac81892cb9c74f6107dafad135338f6d9683507d
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/sw/can_we_infer_swht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "can we infer_swht",
+ "evaluation": {
+ "accuracy": 0.4285140562248996
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/sw/guaranteed_possible_impossible_swht/results.json b/evaluation_xnlihtmt/xnliht/sw/guaranteed_possible_impossible_swht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ba9a00506c2e6fc4a16f3e21fd815d44dbca4a5
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/sw/guaranteed_possible_impossible_swht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "guaranteed/possible/impossible_swht",
+ "evaluation": {
+ "accuracy": 0.38433734939759034
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/sw/justified_in_saying_swht/results.json b/evaluation_xnlihtmt/xnliht/sw/justified_in_saying_swht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5b3e944be5fd9d9d99fb1b9a48e933a0837aa80f
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/sw/justified_in_saying_swht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "justified in saying_swht",
+ "evaluation": {
+ "accuracy": 0.41967871485943775
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/ur/GPT-3_style_urht/results.json b/evaluation_xnlihtmt/xnliht/ur/GPT-3_style_urht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a8b24bdd46252fb9a65c95158e410ba06d95808
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ur/GPT-3_style_urht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "GPT-3 style_urht",
+ "evaluation": {
+ "accuracy": 0.463855421686747
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/ur/MNLI_crowdsource_urht/results.json b/evaluation_xnlihtmt/xnliht/ur/MNLI_crowdsource_urht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..daec90e79a511836f4cb80c164414151fd805e8d
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ur/MNLI_crowdsource_urht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "MNLI crowdsource_urht",
+ "evaluation": {
+ "accuracy": 0.40441767068273093
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/ur/can_we_infer_urht/results.json b/evaluation_xnlihtmt/xnliht/ur/can_we_infer_urht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0b83201fc43fdb6641cc4783c7008f87aed55bad
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ur/can_we_infer_urht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "can we infer_urht",
+ "evaluation": {
+ "accuracy": 0.3895582329317269
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/ur/guaranteed_possible_impossible_urht/results.json b/evaluation_xnlihtmt/xnliht/ur/guaranteed_possible_impossible_urht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5c63d51d73fa542f757e07787912f164ca05d995
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ur/guaranteed_possible_impossible_urht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "guaranteed/possible/impossible_urht",
+ "evaluation": {
+ "accuracy": 0.3405622489959839
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/ur/justified_in_saying_urht/results.json b/evaluation_xnlihtmt/xnliht/ur/justified_in_saying_urht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..bdd9008231968eb2e6743a6a45934a07906fad79
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/ur/justified_in_saying_urht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "justified in saying_urht",
+ "evaluation": {
+ "accuracy": 0.43293172690763054
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/vi/GPT-3_style_viht/results.json b/evaluation_xnlihtmt/xnliht/vi/GPT-3_style_viht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f4e01f9fb887fe7de2f4fe46d01345fd92a6d510
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/vi/GPT-3_style_viht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "GPT-3 style_viht",
+ "evaluation": {
+ "accuracy": 0.5261044176706827
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_viht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/vi/MNLI_crowdsource_viht/results.json b/evaluation_xnlihtmt/xnliht/vi/MNLI_crowdsource_viht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8520acbf68d09bc928c0dfb7b737a41e17594de3
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/vi/MNLI_crowdsource_viht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "MNLI crowdsource_viht",
+ "evaluation": {
+ "accuracy": 0.39879518072289155
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_viht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/vi/can_we_infer_viht/results.json b/evaluation_xnlihtmt/xnliht/vi/can_we_infer_viht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b0bdaec60efc2784c288f22715d50e346280108
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/vi/can_we_infer_viht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "can we infer_viht",
+ "evaluation": {
+ "accuracy": 0.5481927710843374
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_viht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/vi/guaranteed_possible_impossible_viht/results.json b/evaluation_xnlihtmt/xnliht/vi/guaranteed_possible_impossible_viht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..097d91dcde196e9d9496f46fa4581108a8bd4494
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/vi/guaranteed_possible_impossible_viht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "guaranteed/possible/impossible_viht",
+ "evaluation": {
+ "accuracy": 0.43694779116465865
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_viht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/vi/justified_in_saying_viht/results.json b/evaluation_xnlihtmt/xnliht/vi/justified_in_saying_viht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5efd3a1377077b27e6da271aae29d9ef8e3b883c
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/vi/justified_in_saying_viht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "justified in saying_viht",
+ "evaluation": {
+ "accuracy": 0.46546184738955826
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_viht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/zh/GPT-3_style_zhht/results.json b/evaluation_xnlihtmt/xnliht/zh/GPT-3_style_zhht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..553709670d2c50865f09e85fa119e026995287ea
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/zh/GPT-3_style_zhht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "GPT-3 style_zhht",
+ "evaluation": {
+ "accuracy": 0.36947791164658633
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/zh/MNLI_crowdsource_zhht/results.json b/evaluation_xnlihtmt/xnliht/zh/MNLI_crowdsource_zhht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..efa8c119c81082c69eba765ada8008d2f34e146e
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/zh/MNLI_crowdsource_zhht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "MNLI crowdsource_zhht",
+ "evaluation": {
+ "accuracy": 0.3457831325301205
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/zh/can_we_infer_zhht/results.json b/evaluation_xnlihtmt/xnliht/zh/can_we_infer_zhht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8cbc3083571d601c942d7fc15d93640c6c7eaa8
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/zh/can_we_infer_zhht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "can we infer_zhht",
+ "evaluation": {
+ "accuracy": 0.3441767068273092
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/zh/guaranteed_possible_impossible_zhht/results.json b/evaluation_xnlihtmt/xnliht/zh/guaranteed_possible_impossible_zhht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1e72172e6d797ddefbbd510348113cc78ac67a2
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/zh/guaranteed_possible_impossible_zhht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "guaranteed/possible/impossible_zhht",
+ "evaluation": {
+ "accuracy": 0.4923694779116466
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnliht/zh/justified_in_saying_zhht/results.json b/evaluation_xnlihtmt/xnliht/zh/justified_in_saying_zhht/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5a0ba1b45a09e30711abbcfd3ff8c6bf9ae068fd
--- /dev/null
+++ b/evaluation_xnlihtmt/xnliht/zh/justified_in_saying_zhht/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "justified in saying_zhht",
+ "evaluation": {
+ "accuracy": 0.3927710843373494
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
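The hunks above and below each add one results.json per language and prompt template, under evaluation_xnlihtmt/xnliht/<lang>/<template>/ and evaluation_xnlihtmt/xnlimt/<lang>/<template>/ respectively, all sharing the same small schema (dataset_name, dataset_config_name, template_name, evaluation.accuracy, arguments). A minimal sketch, assuming that directory tree is checked out locally, of collecting the per-file scores into a single language -> template -> accuracy mapping:

```python
# Illustrative sketch only, not part of this repository.
# Walks the per-template results.json files added in this diff and
# aggregates them by language and template name.
import json
from collections import defaultdict
from pathlib import Path

root = Path("evaluation_xnlihtmt/xnliht")  # directory layout as in the file paths above
aggregated = defaultdict(dict)

for path in sorted(root.glob("*/*/results.json")):
    with open(path, encoding="utf-8") as f:
        record = json.load(f)
    lang = record["dataset_config_name"]      # e.g. "sw", "ur", "vi", "zh"
    template = record["template_name"]        # e.g. "can we infer_swht"
    aggregated[lang][template] = record["evaluation"]["accuracy"]

print(json.dumps(aggregated, indent=2, ensure_ascii=False))
```

Pointing root at evaluation_xnlihtmt/xnlimt instead would collect the machine-translated-prompt results added in the hunks that follow, using the same schema.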
diff --git a/evaluation_xnlihtmt/xnlimt/ar/GPT-3_style_armt/results.json b/evaluation_xnlihtmt/xnlimt/ar/GPT-3_style_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d3df8a608a2418831da7a727daaf768fc29d643a
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ar/GPT-3_style_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "GPT-3 style_armt",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/ar/MNLI_crowdsource_armt/results.json b/evaluation_xnlihtmt/xnlimt/ar/MNLI_crowdsource_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..484fea8074d4096940c1cf3e4cb14838e1ba76f8
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ar/MNLI_crowdsource_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "MNLI crowdsource_armt",
+ "evaluation": {
+ "accuracy": 0.4542168674698795
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/ar/can_we_infer_armt/results.json b/evaluation_xnlihtmt/xnlimt/ar/can_we_infer_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e9b7ea9ed0effc3aa4313b68b2bdebbd1a48c5a6
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ar/can_we_infer_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "can we infer_armt",
+ "evaluation": {
+ "accuracy": 0.41967871485943775
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/ar/guaranteed_possible_impossible_armt/results.json b/evaluation_xnlihtmt/xnlimt/ar/guaranteed_possible_impossible_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c8ceb85c15b119101189af34b5928207747a25b
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ar/guaranteed_possible_impossible_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "guaranteed/possible/impossible_armt",
+ "evaluation": {
+ "accuracy": 0.3795180722891566
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/ar/justified_in_saying_armt/results.json b/evaluation_xnlihtmt/xnlimt/ar/justified_in_saying_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f770884c9af2373782a2991969239d8e9ac36957
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ar/justified_in_saying_armt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ar",
+ "template_name": "justified in saying_armt",
+ "evaluation": {
+ "accuracy": 0.44016064257028115
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/es/GPT-3_style_esmt/results.json b/evaluation_xnlihtmt/xnlimt/es/GPT-3_style_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..fb0db17b3f51d113c8094890d8e3387427676096
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/es/GPT-3_style_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "GPT-3 style_esmt",
+ "evaluation": {
+ "accuracy": 0.5381526104417671
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/es/MNLI_crowdsource_esmt/results.json b/evaluation_xnlihtmt/xnlimt/es/MNLI_crowdsource_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c0297e49e925ab677bb63ea7548be8f16ca193e0
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/es/MNLI_crowdsource_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "MNLI crowdsource_esmt",
+ "evaluation": {
+ "accuracy": 0.4951807228915663
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/es/can_we_infer_esmt/results.json b/evaluation_xnlihtmt/xnlimt/es/can_we_infer_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..16f5ae1d0c3637385fc3b3ac9109610bcab41caf
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/es/can_we_infer_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "can we infer_esmt",
+ "evaluation": {
+ "accuracy": 0.4951807228915663
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/es/guaranteed_possible_impossible_esmt/results.json b/evaluation_xnlihtmt/xnlimt/es/guaranteed_possible_impossible_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7de5aec8f541506b3bd55756b31ee8e84edd71de
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/es/guaranteed_possible_impossible_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "guaranteed/possible/impossible_esmt",
+ "evaluation": {
+ "accuracy": 0.3349397590361446
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/es/justified_in_saying_esmt/results.json b/evaluation_xnlihtmt/xnlimt/es/justified_in_saying_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..026c7c34328b7831028b3709d9a8918c87d9cbb6
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/es/justified_in_saying_esmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "es",
+ "template_name": "justified in saying_esmt",
+ "evaluation": {
+ "accuracy": 0.4955823293172691
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/fr/GPT-3_style_frmt/results.json b/evaluation_xnlihtmt/xnlimt/fr/GPT-3_style_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7274d952c0de2c942814fd124fdbaca283485ff
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/fr/GPT-3_style_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "GPT-3 style_frmt",
+ "evaluation": {
+ "accuracy": 0.4746987951807229
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/fr/MNLI_crowdsource_frmt/results.json b/evaluation_xnlihtmt/xnlimt/fr/MNLI_crowdsource_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4e2e6908b8e98c832eae2b7c01606c6a7e4fbe3e
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/fr/MNLI_crowdsource_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "MNLI crowdsource_frmt",
+ "evaluation": {
+ "accuracy": 0.3538152610441767
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/fr/can_we_infer_frmt/results.json b/evaluation_xnlihtmt/xnlimt/fr/can_we_infer_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..747591c5d05092765d34ce831b80c8247c686b4d
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/fr/can_we_infer_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "can we infer_frmt",
+ "evaluation": {
+ "accuracy": 0.5481927710843374
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/fr/guaranteed_possible_impossible_frmt/results.json b/evaluation_xnlihtmt/xnlimt/fr/guaranteed_possible_impossible_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e68c0401faae36b7d5dd65ad7f70ff83b2017a99
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/fr/guaranteed_possible_impossible_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "guaranteed/possible/impossible_frmt",
+ "evaluation": {
+ "accuracy": 0.5200803212851406
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/fr/justified_in_saying_frmt/results.json b/evaluation_xnlihtmt/xnlimt/fr/justified_in_saying_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e090b6a7eb4dd90f5841a558b077302d3215dc06
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/fr/justified_in_saying_frmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "fr",
+ "template_name": "justified in saying_frmt",
+ "evaluation": {
+ "accuracy": 0.5317269076305221
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/hi/GPT-3_style_himt/results.json b/evaluation_xnlihtmt/xnlimt/hi/GPT-3_style_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d2f4a90d39fd21e0d000fd2bddde19b805d14b7
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/hi/GPT-3_style_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "GPT-3 style_himt",
+ "evaluation": {
+ "accuracy": 0.43734939759036146
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/hi/MNLI_crowdsource_himt/results.json b/evaluation_xnlihtmt/xnlimt/hi/MNLI_crowdsource_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..abe331491e50afd38a294d6ba0a85fe7938d63ed
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/hi/MNLI_crowdsource_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "MNLI crowdsource_himt",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/hi/can_we_infer_himt/results.json b/evaluation_xnlihtmt/xnlimt/hi/can_we_infer_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..15a23fa63fdb4337027f4313554e5ea759f1b6ff
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/hi/can_we_infer_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "can we infer_himt",
+ "evaluation": {
+ "accuracy": 0.4795180722891566
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/hi/guaranteed_possible_impossible_himt/results.json b/evaluation_xnlihtmt/xnlimt/hi/guaranteed_possible_impossible_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed3e0ab27f59a22926fd7a9183629f8a4874116f
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/hi/guaranteed_possible_impossible_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "guaranteed/possible/impossible_himt",
+ "evaluation": {
+ "accuracy": 0.44136546184738956
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/hi/justified_in_saying_himt/results.json b/evaluation_xnlihtmt/xnlimt/hi/justified_in_saying_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6f63b529de724ddad08f41a3e982301790a159f8
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/hi/justified_in_saying_himt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "hi",
+ "template_name": "justified in saying_himt",
+ "evaluation": {
+ "accuracy": 0.4931726907630522
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/merged.csv b/evaluation_xnlihtmt/xnlimt/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..75c13d2a1e9948b86e1e443ac0845ada11e93b59
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/merged.csv
@@ -0,0 +1,50 @@
+dataset,prompt,metric,value
+xnli_ar,GPT-3 style_armt,accuracy,0.3333333333333333
+xnli_ar,MNLI crowdsource_armt,accuracy,0.4542168674698795
+xnli_ar,can we infer_armt,accuracy,0.41967871485943775
+xnli_ar,guaranteed/possible/impossible_armt,accuracy,0.3795180722891566
+xnli_ar,justified in saying_armt,accuracy,0.44016064257028115
+xnli_ar,median,accuracy,0.41967871485943775
+xnli_es,GPT-3 style_esmt,accuracy,0.5381526104417671
+xnli_es,MNLI crowdsource_esmt,accuracy,0.4951807228915663
+xnli_es,can we infer_esmt,accuracy,0.4951807228915663
+xnli_es,guaranteed/possible/impossible_esmt,accuracy,0.3349397590361446
+xnli_es,justified in saying_esmt,accuracy,0.4955823293172691
+xnli_es,median,accuracy,0.4951807228915663
+xnli_fr,GPT-3 style_frmt,accuracy,0.4746987951807229
+xnli_fr,MNLI crowdsource_frmt,accuracy,0.3538152610441767
+xnli_fr,can we infer_frmt,accuracy,0.5481927710843374
+xnli_fr,guaranteed/possible/impossible_frmt,accuracy,0.5200803212851406
+xnli_fr,justified in saying_frmt,accuracy,0.5317269076305221
+xnli_fr,median,accuracy,0.5200803212851406
+xnli_hi,GPT-3 style_himt,accuracy,0.43734939759036146
+xnli_hi,MNLI crowdsource_himt,accuracy,0.3333333333333333
+xnli_hi,can we infer_himt,accuracy,0.4795180722891566
+xnli_hi,guaranteed/possible/impossible_himt,accuracy,0.44136546184738956
+xnli_hi,justified in saying_himt,accuracy,0.4931726907630522
+xnli_hi,median,accuracy,0.44136546184738956
+xnli_sw,GPT-3 style_swmt,accuracy,0.3357429718875502
+xnli_sw,MNLI crowdsource_swmt,accuracy,0.3353413654618474
+xnli_sw,can we infer_swmt,accuracy,0.3682730923694779
+xnli_sw,guaranteed/possible/impossible_swmt,accuracy,0.351004016064257
+xnli_sw,justified in saying_swmt,accuracy,0.36305220883534134
+xnli_sw,median,accuracy,0.351004016064257
+xnli_ur,GPT-3 style_urmt,accuracy,0.3586345381526104
+xnli_ur,MNLI crowdsource_urmt,accuracy,0.3369477911646586
+xnli_ur,can we infer_urmt,accuracy,0.351004016064257
+xnli_ur,guaranteed/possible/impossible_urmt,accuracy,0.3337349397590361
+xnli_ur,justified in saying_urmt,accuracy,0.3381526104417671
+xnli_ur,median,accuracy,0.3381526104417671
+xnli_vi,GPT-3 style_vimt,accuracy,0.3333333333333333
+xnli_vi,MNLI crowdsource_vimt,accuracy,0.3333333333333333
+xnli_vi,can we infer_vimt,accuracy,0.3333333333333333
+xnli_vi,guaranteed/possible/impossible_vimt,accuracy,0.3333333333333333
+xnli_vi,justified in saying_vimt,accuracy,0.3333333333333333
+xnli_vi,median,accuracy,0.3333333333333333
+xnli_zh,GPT-3 style_zhmt,accuracy,0.5224899598393574
+xnli_zh,MNLI crowdsource_zhmt,accuracy,0.4542168674698795
+xnli_zh,can we infer_zhmt,accuracy,0.5184738955823294
+xnli_zh,guaranteed/possible/impossible_zhmt,accuracy,0.334136546184739
+xnli_zh,justified in saying_zhmt,accuracy,0.4955823293172691
+xnli_zh,median,accuracy,0.4955823293172691
+multiple,average,multiple,0.4242971887550201
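
merged.csv flattens these results into one row per (language, prompt) pair, adds a median row per language taken over its five prompts, and closes with a multiple,average,multiple row that is the mean of the eight per-language medians. A small sketch of that arithmetic (values copied from the rows above; this is an illustration, not the repository's aggregation script):

import statistics

# Per-template accuracies for one language, copied from the xnli_zh rows above.
zh = [0.5224899598393574, 0.4542168674698795, 0.5184738955823294,
      0.334136546184739, 0.4955823293172691]
print(statistics.median(zh))  # 0.4955823293172691 -> the "xnli_zh,median" row

# Per-language medians as recorded in merged.csv; their plain mean gives the last row.
medians = [0.41967871485943775, 0.4951807228915663, 0.5200803212851406,
           0.44136546184738956, 0.351004016064257, 0.3381526104417671,
           0.3333333333333333, 0.4955823293172691]
print(statistics.mean(medians))  # ~0.4243; merged.csv records 0.4242971887550201
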
diff --git a/evaluation_xnlihtmt/xnlimt/merged.json b/evaluation_xnlihtmt/xnlimt/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..6093ec1c3c21248845b11f0f8607678434710f15
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/merged.json
@@ -0,0 +1 @@
+{"xnli_ar": {"GPT-3 style_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "GPT-3 style_armt"}, "MNLI crowdsource_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4542168674698795}, "template_name": "MNLI crowdsource_armt"}, "can we infer_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41967871485943775}, "template_name": "can we infer_armt"}, "guaranteed/possible/impossible_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3795180722891566}, "template_name": "guaranteed/possible/impossible_armt"}, "justified in saying_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44016064257028115}, "template_name": "justified in saying_armt"}}, "xnli_es": {"GPT-3 style_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5381526104417671}, "template_name": "GPT-3 style_esmt"}, "MNLI crowdsource_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4951807228915663}, "template_name": "MNLI crowdsource_esmt"}, "can we infer_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4951807228915663}, "template_name": "can we infer_esmt"}, "guaranteed/possible/impossible_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3349397590361446}, "template_name": "guaranteed/possible/impossible_esmt"}, "justified in saying_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4955823293172691}, "template_name": "justified in saying_esmt"}}, "xnli_fr": {"GPT-3 style_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4746987951807229}, "template_name": "GPT-3 style_frmt"}, "MNLI crowdsource_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3538152610441767}, "template_name": "MNLI crowdsource_frmt"}, "can we infer_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 
0.5481927710843374}, "template_name": "can we infer_frmt"}, "guaranteed/possible/impossible_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5200803212851406}, "template_name": "guaranteed/possible/impossible_frmt"}, "justified in saying_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5317269076305221}, "template_name": "justified in saying_frmt"}}, "xnli_hi": {"GPT-3 style_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43734939759036146}, "template_name": "GPT-3 style_himt"}, "MNLI crowdsource_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_himt"}, "can we infer_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4795180722891566}, "template_name": "can we infer_himt"}, "guaranteed/possible/impossible_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44136546184738956}, "template_name": "guaranteed/possible/impossible_himt"}, "justified in saying_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4931726907630522}, "template_name": "justified in saying_himt"}}, "xnli_sw": {"GPT-3 style_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3357429718875502}, "template_name": "GPT-3 style_swmt"}, "MNLI crowdsource_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', 
pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3353413654618474}, "template_name": "MNLI crowdsource_swmt"}, "can we infer_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3682730923694779}, "template_name": "can we infer_swmt"}, "guaranteed/possible/impossible_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.351004016064257}, "template_name": "guaranteed/possible/impossible_swmt"}, "justified in saying_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.36305220883534134}, "template_name": "justified in saying_swmt"}}, "xnli_ur": {"GPT-3 style_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", 
"evaluation": {"accuracy": 0.3586345381526104}, "template_name": "GPT-3 style_urmt"}, "MNLI crowdsource_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3369477911646586}, "template_name": "MNLI crowdsource_urmt"}, "can we infer_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.351004016064257}, "template_name": "can we infer_urmt"}, "guaranteed/possible/impossible_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3337349397590361}, "template_name": "guaranteed/possible/impossible_urmt"}, "justified in saying_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3381526104417671}, "template_name": "justified in saying_urmt"}}, "xnli_vi": {"GPT-3 style_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, 
model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "GPT-3 style_vimt"}, "MNLI crowdsource_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_vimt"}, "can we infer_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "can we infer_vimt"}, "guaranteed/possible/impossible_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "guaranteed/possible/impossible_vimt"}, "justified in saying_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, 
per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "justified in saying_vimt"}}, "xnli_zh": {"GPT-3 style_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5224899598393574}, "template_name": "GPT-3 style_zhmt"}, "MNLI crowdsource_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4542168674698795}, "template_name": "MNLI crowdsource_zhmt"}, "can we infer_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5184738955823294}, "template_name": "can we infer_zhmt"}, "guaranteed/possible/impossible_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.334136546184739}, 
"template_name": "guaranteed/possible/impossible_zhmt"}, "justified in saying_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4955823293172691}, "template_name": "justified in saying_zhmt"}}}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/sw/GPT-3_style_swmt/results.json b/evaluation_xnlihtmt/xnlimt/sw/GPT-3_style_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7af2ade81619d1d5d680f533d2d718004ac47e9f
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/sw/GPT-3_style_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "GPT-3 style_swmt",
+ "evaluation": {
+ "accuracy": 0.3357429718875502
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/sw/MNLI_crowdsource_swmt/results.json b/evaluation_xnlihtmt/xnlimt/sw/MNLI_crowdsource_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1a7435f3ab6279405fe0758f869cb8e13c33482e
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/sw/MNLI_crowdsource_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "MNLI crowdsource_swmt",
+ "evaluation": {
+ "accuracy": 0.3353413654618474
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/sw/can_we_infer_swmt/results.json b/evaluation_xnlihtmt/xnlimt/sw/can_we_infer_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a09fc6d4bbbeee4f4f9f9c803acfaf3b1c48c451
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/sw/can_we_infer_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "can we infer_swmt",
+ "evaluation": {
+ "accuracy": 0.3682730923694779
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/sw/guaranteed_possible_impossible_swmt/results.json b/evaluation_xnlihtmt/xnlimt/sw/guaranteed_possible_impossible_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f15426fe5979ede1ee47fc0edfcd025c24963e85
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/sw/guaranteed_possible_impossible_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "guaranteed/possible/impossible_swmt",
+ "evaluation": {
+ "accuracy": 0.351004016064257
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/sw/justified_in_saying_swmt/results.json b/evaluation_xnlihtmt/xnlimt/sw/justified_in_saying_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..90394faebea44e1c6d7894b2cc89c522ebd8df7a
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/sw/justified_in_saying_swmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "sw",
+ "template_name": "justified in saying_swmt",
+ "evaluation": {
+ "accuracy": 0.36305220883534134
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/ur/GPT-3_style_urmt/results.json b/evaluation_xnlihtmt/xnlimt/ur/GPT-3_style_urmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0282d0be6303786a180f50618e07877161f505b3
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ur/GPT-3_style_urmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "GPT-3 style_urmt",
+ "evaluation": {
+ "accuracy": 0.3586345381526104
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/ur/MNLI_crowdsource_urmt/results.json b/evaluation_xnlihtmt/xnlimt/ur/MNLI_crowdsource_urmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..611968c8b90be64d7d60113395460c0fd457d2b2
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ur/MNLI_crowdsource_urmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "MNLI crowdsource_urmt",
+ "evaluation": {
+ "accuracy": 0.3369477911646586
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/ur/can_we_infer_urmt/results.json b/evaluation_xnlihtmt/xnlimt/ur/can_we_infer_urmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c1ae10e00ad59b9af33932760d44cc12ea1f8ee
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ur/can_we_infer_urmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "can we infer_urmt",
+ "evaluation": {
+ "accuracy": 0.351004016064257
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/ur/guaranteed_possible_impossible_urmt/results.json b/evaluation_xnlihtmt/xnlimt/ur/guaranteed_possible_impossible_urmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..35f9483bd5e19259d6f473f0c4d973e39f80fca4
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ur/guaranteed_possible_impossible_urmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "guaranteed/possible/impossible_urmt",
+ "evaluation": {
+ "accuracy": 0.3337349397590361
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/ur/justified_in_saying_urmt/results.json b/evaluation_xnlihtmt/xnlimt/ur/justified_in_saying_urmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca92fbad1769d545db2d32b0cc2cd21fc2531536
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/ur/justified_in_saying_urmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "ur",
+ "template_name": "justified in saying_urmt",
+ "evaluation": {
+ "accuracy": 0.3381526104417671
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/vi/GPT-3_style_vimt/results.json b/evaluation_xnlihtmt/xnlimt/vi/GPT-3_style_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..58246bf4d433606693cb97adc8622b6fa1c74e4c
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/vi/GPT-3_style_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "GPT-3 style_vimt",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/vi/MNLI_crowdsource_vimt/results.json b/evaluation_xnlihtmt/xnlimt/vi/MNLI_crowdsource_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5dd3868927989b006df5b17daa4cc23d5238ad22
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/vi/MNLI_crowdsource_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "MNLI crowdsource_vimt",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/vi/can_we_infer_vimt/results.json b/evaluation_xnlihtmt/xnlimt/vi/can_we_infer_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b73e0f205f3841b05d86a9449c601b8b6f111e5e
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/vi/can_we_infer_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "can we infer_vimt",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/vi/guaranteed_possible_impossible_vimt/results.json b/evaluation_xnlihtmt/xnlimt/vi/guaranteed_possible_impossible_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..39992e80223b4a30b931b52f708a7838f6061c8c
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/vi/guaranteed_possible_impossible_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "guaranteed/possible/impossible_vimt",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/vi/justified_in_saying_vimt/results.json b/evaluation_xnlihtmt/xnlimt/vi/justified_in_saying_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e723b8a7e36d3a1063dfd79f26ee3443b922fbb8
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/vi/justified_in_saying_vimt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "vi",
+ "template_name": "justified in saying_vimt",
+ "evaluation": {
+ "accuracy": 0.3333333333333333
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/zh/GPT-3_style_zhmt/results.json b/evaluation_xnlihtmt/xnlimt/zh/GPT-3_style_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e1ae998ee8df46751b4fe8f0cb439cb1a29acbea
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/zh/GPT-3_style_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "GPT-3 style_zhmt",
+ "evaluation": {
+ "accuracy": 0.5224899598393574
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/zh/MNLI_crowdsource_zhmt/results.json b/evaluation_xnlihtmt/xnlimt/zh/MNLI_crowdsource_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2b524f2a25b623cbb28fe5d4ebd40ac24d4578c2
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/zh/MNLI_crowdsource_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "MNLI crowdsource_zhmt",
+ "evaluation": {
+ "accuracy": 0.4542168674698795
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/zh/can_we_infer_zhmt/results.json b/evaluation_xnlihtmt/xnlimt/zh/can_we_infer_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7a9e5b9cd1bd435fe6320882569918515362eecd
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/zh/can_we_infer_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "can we infer_zhmt",
+ "evaluation": {
+ "accuracy": 0.5184738955823294
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/zh/guaranteed_possible_impossible_zhmt/results.json b/evaluation_xnlihtmt/xnlimt/zh/guaranteed_possible_impossible_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9af567d30784f4d6ccd77e76af4f9c84323d92db
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/zh/guaranteed_possible_impossible_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "guaranteed/possible/impossible_zhmt",
+ "evaluation": {
+ "accuracy": 0.334136546184739
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xnlihtmt/xnlimt/zh/justified_in_saying_zhmt/results.json b/evaluation_xnlihtmt/xnlimt/zh/justified_in_saying_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c6db375f86ea1876f4eca2db924d921d1ce7b11
--- /dev/null
+++ b/evaluation_xnlihtmt/xnlimt/zh/justified_in_saying_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+ "dataset_name": "xnli",
+ "dataset_config_name": "zh",
+ "template_name": "justified in saying_zhmt",
+ "evaluation": {
+ "accuracy": 0.4955823293172691
+ },
+ "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..745df138d046295c44b52b3a17342a4b8ef7345c
--- /dev/null
+++ b/pytorch_model.bin.index.json
@@ -0,0 +1,852 @@
+{
+ "metadata": {
+ "total_size": 352494542848
+ },
+ "weight_map": {
+ "h.0.input_layernorm.bias": "pytorch_model_00002-of-00072.bin",
+ "h.0.input_layernorm.weight": "pytorch_model_00002-of-00072.bin",
+ "h.0.mlp.dense_4h_to_h.bias": "pytorch_model_00002-of-00072.bin",
+ "h.0.mlp.dense_4h_to_h.weight": "pytorch_model_00002-of-00072.bin",
+ "h.0.mlp.dense_h_to_4h.bias": "pytorch_model_00002-of-00072.bin",
+ "h.0.mlp.dense_h_to_4h.weight": "pytorch_model_00002-of-00072.bin",
+ "h.0.post_attention_layernorm.bias": "pytorch_model_00002-of-00072.bin",
+ "h.0.post_attention_layernorm.weight": "pytorch_model_00002-of-00072.bin",
+ "h.0.self_attention.dense.bias": "pytorch_model_00002-of-00072.bin",
+ "h.0.self_attention.dense.weight": "pytorch_model_00002-of-00072.bin",
+ "h.0.self_attention.query_key_value.bias": "pytorch_model_00002-of-00072.bin",
+ "h.0.self_attention.query_key_value.weight": "pytorch_model_00002-of-00072.bin",
+ "h.1.input_layernorm.bias": "pytorch_model_00003-of-00072.bin",
+ "h.1.input_layernorm.weight": "pytorch_model_00003-of-00072.bin",
+ "h.1.mlp.dense_4h_to_h.bias": "pytorch_model_00003-of-00072.bin",
+ "h.1.mlp.dense_4h_to_h.weight": "pytorch_model_00003-of-00072.bin",
+ "h.1.mlp.dense_h_to_4h.bias": "pytorch_model_00003-of-00072.bin",
+ "h.1.mlp.dense_h_to_4h.weight": "pytorch_model_00003-of-00072.bin",
+ "h.1.post_attention_layernorm.bias": "pytorch_model_00003-of-00072.bin",
+ "h.1.post_attention_layernorm.weight": "pytorch_model_00003-of-00072.bin",
+ "h.1.self_attention.dense.bias": "pytorch_model_00003-of-00072.bin",
+ "h.1.self_attention.dense.weight": "pytorch_model_00003-of-00072.bin",
+ "h.1.self_attention.query_key_value.bias": "pytorch_model_00003-of-00072.bin",
+ "h.1.self_attention.query_key_value.weight": "pytorch_model_00003-of-00072.bin",
+ "h.10.input_layernorm.bias": "pytorch_model_00012-of-00072.bin",
+ "h.10.input_layernorm.weight": "pytorch_model_00012-of-00072.bin",
+ "h.10.mlp.dense_4h_to_h.bias": "pytorch_model_00012-of-00072.bin",
+ "h.10.mlp.dense_4h_to_h.weight": "pytorch_model_00012-of-00072.bin",
+ "h.10.mlp.dense_h_to_4h.bias": "pytorch_model_00012-of-00072.bin",
+ "h.10.mlp.dense_h_to_4h.weight": "pytorch_model_00012-of-00072.bin",
+ "h.10.post_attention_layernorm.bias": "pytorch_model_00012-of-00072.bin",
+ "h.10.post_attention_layernorm.weight": "pytorch_model_00012-of-00072.bin",
+ "h.10.self_attention.dense.bias": "pytorch_model_00012-of-00072.bin",
+ "h.10.self_attention.dense.weight": "pytorch_model_00012-of-00072.bin",
+ "h.10.self_attention.query_key_value.bias": "pytorch_model_00012-of-00072.bin",
+ "h.10.self_attention.query_key_value.weight": "pytorch_model_00012-of-00072.bin",
+ "h.11.input_layernorm.bias": "pytorch_model_00013-of-00072.bin",
+ "h.11.input_layernorm.weight": "pytorch_model_00013-of-00072.bin",
+ "h.11.mlp.dense_4h_to_h.bias": "pytorch_model_00013-of-00072.bin",
+ "h.11.mlp.dense_4h_to_h.weight": "pytorch_model_00013-of-00072.bin",
+ "h.11.mlp.dense_h_to_4h.bias": "pytorch_model_00013-of-00072.bin",
+ "h.11.mlp.dense_h_to_4h.weight": "pytorch_model_00013-of-00072.bin",
+ "h.11.post_attention_layernorm.bias": "pytorch_model_00013-of-00072.bin",
+ "h.11.post_attention_layernorm.weight": "pytorch_model_00013-of-00072.bin",
+ "h.11.self_attention.dense.bias": "pytorch_model_00013-of-00072.bin",
+ "h.11.self_attention.dense.weight": "pytorch_model_00013-of-00072.bin",
+ "h.11.self_attention.query_key_value.bias": "pytorch_model_00013-of-00072.bin",
+ "h.11.self_attention.query_key_value.weight": "pytorch_model_00013-of-00072.bin",
+ "h.12.input_layernorm.bias": "pytorch_model_00014-of-00072.bin",
+ "h.12.input_layernorm.weight": "pytorch_model_00014-of-00072.bin",
+ "h.12.mlp.dense_4h_to_h.bias": "pytorch_model_00014-of-00072.bin",
+ "h.12.mlp.dense_4h_to_h.weight": "pytorch_model_00014-of-00072.bin",
+ "h.12.mlp.dense_h_to_4h.bias": "pytorch_model_00014-of-00072.bin",
+ "h.12.mlp.dense_h_to_4h.weight": "pytorch_model_00014-of-00072.bin",
+ "h.12.post_attention_layernorm.bias": "pytorch_model_00014-of-00072.bin",
+ "h.12.post_attention_layernorm.weight": "pytorch_model_00014-of-00072.bin",
+ "h.12.self_attention.dense.bias": "pytorch_model_00014-of-00072.bin",
+ "h.12.self_attention.dense.weight": "pytorch_model_00014-of-00072.bin",
+ "h.12.self_attention.query_key_value.bias": "pytorch_model_00014-of-00072.bin",
+ "h.12.self_attention.query_key_value.weight": "pytorch_model_00014-of-00072.bin",
+ "h.13.input_layernorm.bias": "pytorch_model_00015-of-00072.bin",
+ "h.13.input_layernorm.weight": "pytorch_model_00015-of-00072.bin",
+ "h.13.mlp.dense_4h_to_h.bias": "pytorch_model_00015-of-00072.bin",
+ "h.13.mlp.dense_4h_to_h.weight": "pytorch_model_00015-of-00072.bin",
+ "h.13.mlp.dense_h_to_4h.bias": "pytorch_model_00015-of-00072.bin",
+ "h.13.mlp.dense_h_to_4h.weight": "pytorch_model_00015-of-00072.bin",
+ "h.13.post_attention_layernorm.bias": "pytorch_model_00015-of-00072.bin",
+ "h.13.post_attention_layernorm.weight": "pytorch_model_00015-of-00072.bin",
+ "h.13.self_attention.dense.bias": "pytorch_model_00015-of-00072.bin",
+ "h.13.self_attention.dense.weight": "pytorch_model_00015-of-00072.bin",
+ "h.13.self_attention.query_key_value.bias": "pytorch_model_00015-of-00072.bin",
+ "h.13.self_attention.query_key_value.weight": "pytorch_model_00015-of-00072.bin",
+ "h.14.input_layernorm.bias": "pytorch_model_00016-of-00072.bin",
+ "h.14.input_layernorm.weight": "pytorch_model_00016-of-00072.bin",
+ "h.14.mlp.dense_4h_to_h.bias": "pytorch_model_00016-of-00072.bin",
+ "h.14.mlp.dense_4h_to_h.weight": "pytorch_model_00016-of-00072.bin",
+ "h.14.mlp.dense_h_to_4h.bias": "pytorch_model_00016-of-00072.bin",
+ "h.14.mlp.dense_h_to_4h.weight": "pytorch_model_00016-of-00072.bin",
+ "h.14.post_attention_layernorm.bias": "pytorch_model_00016-of-00072.bin",
+ "h.14.post_attention_layernorm.weight": "pytorch_model_00016-of-00072.bin",
+ "h.14.self_attention.dense.bias": "pytorch_model_00016-of-00072.bin",
+ "h.14.self_attention.dense.weight": "pytorch_model_00016-of-00072.bin",
+ "h.14.self_attention.query_key_value.bias": "pytorch_model_00016-of-00072.bin",
+ "h.14.self_attention.query_key_value.weight": "pytorch_model_00016-of-00072.bin",
+ "h.15.input_layernorm.bias": "pytorch_model_00017-of-00072.bin",
+ "h.15.input_layernorm.weight": "pytorch_model_00017-of-00072.bin",
+ "h.15.mlp.dense_4h_to_h.bias": "pytorch_model_00017-of-00072.bin",
+ "h.15.mlp.dense_4h_to_h.weight": "pytorch_model_00017-of-00072.bin",
+ "h.15.mlp.dense_h_to_4h.bias": "pytorch_model_00017-of-00072.bin",
+ "h.15.mlp.dense_h_to_4h.weight": "pytorch_model_00017-of-00072.bin",
+ "h.15.post_attention_layernorm.bias": "pytorch_model_00017-of-00072.bin",
+ "h.15.post_attention_layernorm.weight": "pytorch_model_00017-of-00072.bin",
+ "h.15.self_attention.dense.bias": "pytorch_model_00017-of-00072.bin",
+ "h.15.self_attention.dense.weight": "pytorch_model_00017-of-00072.bin",
+ "h.15.self_attention.query_key_value.bias": "pytorch_model_00017-of-00072.bin",
+ "h.15.self_attention.query_key_value.weight": "pytorch_model_00017-of-00072.bin",
+ "h.16.input_layernorm.bias": "pytorch_model_00018-of-00072.bin",
+ "h.16.input_layernorm.weight": "pytorch_model_00018-of-00072.bin",
+ "h.16.mlp.dense_4h_to_h.bias": "pytorch_model_00018-of-00072.bin",
+ "h.16.mlp.dense_4h_to_h.weight": "pytorch_model_00018-of-00072.bin",
+ "h.16.mlp.dense_h_to_4h.bias": "pytorch_model_00018-of-00072.bin",
+ "h.16.mlp.dense_h_to_4h.weight": "pytorch_model_00018-of-00072.bin",
+ "h.16.post_attention_layernorm.bias": "pytorch_model_00018-of-00072.bin",
+ "h.16.post_attention_layernorm.weight": "pytorch_model_00018-of-00072.bin",
+ "h.16.self_attention.dense.bias": "pytorch_model_00018-of-00072.bin",
+ "h.16.self_attention.dense.weight": "pytorch_model_00018-of-00072.bin",
+ "h.16.self_attention.query_key_value.bias": "pytorch_model_00018-of-00072.bin",
+ "h.16.self_attention.query_key_value.weight": "pytorch_model_00018-of-00072.bin",
+ "h.17.input_layernorm.bias": "pytorch_model_00019-of-00072.bin",
+ "h.17.input_layernorm.weight": "pytorch_model_00019-of-00072.bin",
+ "h.17.mlp.dense_4h_to_h.bias": "pytorch_model_00019-of-00072.bin",
+ "h.17.mlp.dense_4h_to_h.weight": "pytorch_model_00019-of-00072.bin",
+ "h.17.mlp.dense_h_to_4h.bias": "pytorch_model_00019-of-00072.bin",
+ "h.17.mlp.dense_h_to_4h.weight": "pytorch_model_00019-of-00072.bin",
+ "h.17.post_attention_layernorm.bias": "pytorch_model_00019-of-00072.bin",
+ "h.17.post_attention_layernorm.weight": "pytorch_model_00019-of-00072.bin",
+ "h.17.self_attention.dense.bias": "pytorch_model_00019-of-00072.bin",
+ "h.17.self_attention.dense.weight": "pytorch_model_00019-of-00072.bin",
+ "h.17.self_attention.query_key_value.bias": "pytorch_model_00019-of-00072.bin",
+ "h.17.self_attention.query_key_value.weight": "pytorch_model_00019-of-00072.bin",
+ "h.18.input_layernorm.bias": "pytorch_model_00020-of-00072.bin",
+ "h.18.input_layernorm.weight": "pytorch_model_00020-of-00072.bin",
+ "h.18.mlp.dense_4h_to_h.bias": "pytorch_model_00020-of-00072.bin",
+ "h.18.mlp.dense_4h_to_h.weight": "pytorch_model_00020-of-00072.bin",
+ "h.18.mlp.dense_h_to_4h.bias": "pytorch_model_00020-of-00072.bin",
+ "h.18.mlp.dense_h_to_4h.weight": "pytorch_model_00020-of-00072.bin",
+ "h.18.post_attention_layernorm.bias": "pytorch_model_00020-of-00072.bin",
+ "h.18.post_attention_layernorm.weight": "pytorch_model_00020-of-00072.bin",
+ "h.18.self_attention.dense.bias": "pytorch_model_00020-of-00072.bin",
+ "h.18.self_attention.dense.weight": "pytorch_model_00020-of-00072.bin",
+ "h.18.self_attention.query_key_value.bias": "pytorch_model_00020-of-00072.bin",
+ "h.18.self_attention.query_key_value.weight": "pytorch_model_00020-of-00072.bin",
+ "h.19.input_layernorm.bias": "pytorch_model_00021-of-00072.bin",
+ "h.19.input_layernorm.weight": "pytorch_model_00021-of-00072.bin",
+ "h.19.mlp.dense_4h_to_h.bias": "pytorch_model_00021-of-00072.bin",
+ "h.19.mlp.dense_4h_to_h.weight": "pytorch_model_00021-of-00072.bin",
+ "h.19.mlp.dense_h_to_4h.bias": "pytorch_model_00021-of-00072.bin",
+ "h.19.mlp.dense_h_to_4h.weight": "pytorch_model_00021-of-00072.bin",
+ "h.19.post_attention_layernorm.bias": "pytorch_model_00021-of-00072.bin",
+ "h.19.post_attention_layernorm.weight": "pytorch_model_00021-of-00072.bin",
+ "h.19.self_attention.dense.bias": "pytorch_model_00021-of-00072.bin",
+ "h.19.self_attention.dense.weight": "pytorch_model_00021-of-00072.bin",
+ "h.19.self_attention.query_key_value.bias": "pytorch_model_00021-of-00072.bin",
+ "h.19.self_attention.query_key_value.weight": "pytorch_model_00021-of-00072.bin",
+ "h.2.input_layernorm.bias": "pytorch_model_00004-of-00072.bin",
+ "h.2.input_layernorm.weight": "pytorch_model_00004-of-00072.bin",
+ "h.2.mlp.dense_4h_to_h.bias": "pytorch_model_00004-of-00072.bin",
+ "h.2.mlp.dense_4h_to_h.weight": "pytorch_model_00004-of-00072.bin",
+ "h.2.mlp.dense_h_to_4h.bias": "pytorch_model_00004-of-00072.bin",
+ "h.2.mlp.dense_h_to_4h.weight": "pytorch_model_00004-of-00072.bin",
+ "h.2.post_attention_layernorm.bias": "pytorch_model_00004-of-00072.bin",
+ "h.2.post_attention_layernorm.weight": "pytorch_model_00004-of-00072.bin",
+ "h.2.self_attention.dense.bias": "pytorch_model_00004-of-00072.bin",
+ "h.2.self_attention.dense.weight": "pytorch_model_00004-of-00072.bin",
+ "h.2.self_attention.query_key_value.bias": "pytorch_model_00004-of-00072.bin",
+ "h.2.self_attention.query_key_value.weight": "pytorch_model_00004-of-00072.bin",
+ "h.20.input_layernorm.bias": "pytorch_model_00022-of-00072.bin",
+ "h.20.input_layernorm.weight": "pytorch_model_00022-of-00072.bin",
+ "h.20.mlp.dense_4h_to_h.bias": "pytorch_model_00022-of-00072.bin",
+ "h.20.mlp.dense_4h_to_h.weight": "pytorch_model_00022-of-00072.bin",
+ "h.20.mlp.dense_h_to_4h.bias": "pytorch_model_00022-of-00072.bin",
+ "h.20.mlp.dense_h_to_4h.weight": "pytorch_model_00022-of-00072.bin",
+ "h.20.post_attention_layernorm.bias": "pytorch_model_00022-of-00072.bin",
+ "h.20.post_attention_layernorm.weight": "pytorch_model_00022-of-00072.bin",
+ "h.20.self_attention.dense.bias": "pytorch_model_00022-of-00072.bin",
+ "h.20.self_attention.dense.weight": "pytorch_model_00022-of-00072.bin",
+ "h.20.self_attention.query_key_value.bias": "pytorch_model_00022-of-00072.bin",
+ "h.20.self_attention.query_key_value.weight": "pytorch_model_00022-of-00072.bin",
+ "h.21.input_layernorm.bias": "pytorch_model_00023-of-00072.bin",
+ "h.21.input_layernorm.weight": "pytorch_model_00023-of-00072.bin",
+ "h.21.mlp.dense_4h_to_h.bias": "pytorch_model_00023-of-00072.bin",
+ "h.21.mlp.dense_4h_to_h.weight": "pytorch_model_00023-of-00072.bin",
+ "h.21.mlp.dense_h_to_4h.bias": "pytorch_model_00023-of-00072.bin",
+ "h.21.mlp.dense_h_to_4h.weight": "pytorch_model_00023-of-00072.bin",
+ "h.21.post_attention_layernorm.bias": "pytorch_model_00023-of-00072.bin",
+ "h.21.post_attention_layernorm.weight": "pytorch_model_00023-of-00072.bin",
+ "h.21.self_attention.dense.bias": "pytorch_model_00023-of-00072.bin",
+ "h.21.self_attention.dense.weight": "pytorch_model_00023-of-00072.bin",
+ "h.21.self_attention.query_key_value.bias": "pytorch_model_00023-of-00072.bin",
+ "h.21.self_attention.query_key_value.weight": "pytorch_model_00023-of-00072.bin",
+ "h.22.input_layernorm.bias": "pytorch_model_00024-of-00072.bin",
+ "h.22.input_layernorm.weight": "pytorch_model_00024-of-00072.bin",
+ "h.22.mlp.dense_4h_to_h.bias": "pytorch_model_00024-of-00072.bin",
+ "h.22.mlp.dense_4h_to_h.weight": "pytorch_model_00024-of-00072.bin",
+ "h.22.mlp.dense_h_to_4h.bias": "pytorch_model_00024-of-00072.bin",
+ "h.22.mlp.dense_h_to_4h.weight": "pytorch_model_00024-of-00072.bin",
+ "h.22.post_attention_layernorm.bias": "pytorch_model_00024-of-00072.bin",
+ "h.22.post_attention_layernorm.weight": "pytorch_model_00024-of-00072.bin",
+ "h.22.self_attention.dense.bias": "pytorch_model_00024-of-00072.bin",
+ "h.22.self_attention.dense.weight": "pytorch_model_00024-of-00072.bin",
+ "h.22.self_attention.query_key_value.bias": "pytorch_model_00024-of-00072.bin",
+ "h.22.self_attention.query_key_value.weight": "pytorch_model_00024-of-00072.bin",
+ "h.23.input_layernorm.bias": "pytorch_model_00025-of-00072.bin",
+ "h.23.input_layernorm.weight": "pytorch_model_00025-of-00072.bin",
+ "h.23.mlp.dense_4h_to_h.bias": "pytorch_model_00025-of-00072.bin",
+ "h.23.mlp.dense_4h_to_h.weight": "pytorch_model_00025-of-00072.bin",
+ "h.23.mlp.dense_h_to_4h.bias": "pytorch_model_00025-of-00072.bin",
+ "h.23.mlp.dense_h_to_4h.weight": "pytorch_model_00025-of-00072.bin",
+ "h.23.post_attention_layernorm.bias": "pytorch_model_00025-of-00072.bin",
+ "h.23.post_attention_layernorm.weight": "pytorch_model_00025-of-00072.bin",
+ "h.23.self_attention.dense.bias": "pytorch_model_00025-of-00072.bin",
+ "h.23.self_attention.dense.weight": "pytorch_model_00025-of-00072.bin",
+ "h.23.self_attention.query_key_value.bias": "pytorch_model_00025-of-00072.bin",
+ "h.23.self_attention.query_key_value.weight": "pytorch_model_00025-of-00072.bin",
+ "h.24.input_layernorm.bias": "pytorch_model_00026-of-00072.bin",
+ "h.24.input_layernorm.weight": "pytorch_model_00026-of-00072.bin",
+ "h.24.mlp.dense_4h_to_h.bias": "pytorch_model_00026-of-00072.bin",
+ "h.24.mlp.dense_4h_to_h.weight": "pytorch_model_00026-of-00072.bin",
+ "h.24.mlp.dense_h_to_4h.bias": "pytorch_model_00026-of-00072.bin",
+ "h.24.mlp.dense_h_to_4h.weight": "pytorch_model_00026-of-00072.bin",
+ "h.24.post_attention_layernorm.bias": "pytorch_model_00026-of-00072.bin",
+ "h.24.post_attention_layernorm.weight": "pytorch_model_00026-of-00072.bin",
+ "h.24.self_attention.dense.bias": "pytorch_model_00026-of-00072.bin",
+ "h.24.self_attention.dense.weight": "pytorch_model_00026-of-00072.bin",
+ "h.24.self_attention.query_key_value.bias": "pytorch_model_00026-of-00072.bin",
+ "h.24.self_attention.query_key_value.weight": "pytorch_model_00026-of-00072.bin",
+ "h.25.input_layernorm.bias": "pytorch_model_00027-of-00072.bin",
+ "h.25.input_layernorm.weight": "pytorch_model_00027-of-00072.bin",
+ "h.25.mlp.dense_4h_to_h.bias": "pytorch_model_00027-of-00072.bin",
+ "h.25.mlp.dense_4h_to_h.weight": "pytorch_model_00027-of-00072.bin",
+ "h.25.mlp.dense_h_to_4h.bias": "pytorch_model_00027-of-00072.bin",
+ "h.25.mlp.dense_h_to_4h.weight": "pytorch_model_00027-of-00072.bin",
+ "h.25.post_attention_layernorm.bias": "pytorch_model_00027-of-00072.bin",
+ "h.25.post_attention_layernorm.weight": "pytorch_model_00027-of-00072.bin",
+ "h.25.self_attention.dense.bias": "pytorch_model_00027-of-00072.bin",
+ "h.25.self_attention.dense.weight": "pytorch_model_00027-of-00072.bin",
+ "h.25.self_attention.query_key_value.bias": "pytorch_model_00027-of-00072.bin",
+ "h.25.self_attention.query_key_value.weight": "pytorch_model_00027-of-00072.bin",
+ "h.26.input_layernorm.bias": "pytorch_model_00028-of-00072.bin",
+ "h.26.input_layernorm.weight": "pytorch_model_00028-of-00072.bin",
+ "h.26.mlp.dense_4h_to_h.bias": "pytorch_model_00028-of-00072.bin",
+ "h.26.mlp.dense_4h_to_h.weight": "pytorch_model_00028-of-00072.bin",
+ "h.26.mlp.dense_h_to_4h.bias": "pytorch_model_00028-of-00072.bin",
+ "h.26.mlp.dense_h_to_4h.weight": "pytorch_model_00028-of-00072.bin",
+ "h.26.post_attention_layernorm.bias": "pytorch_model_00028-of-00072.bin",
+ "h.26.post_attention_layernorm.weight": "pytorch_model_00028-of-00072.bin",
+ "h.26.self_attention.dense.bias": "pytorch_model_00028-of-00072.bin",
+ "h.26.self_attention.dense.weight": "pytorch_model_00028-of-00072.bin",
+ "h.26.self_attention.query_key_value.bias": "pytorch_model_00028-of-00072.bin",
+ "h.26.self_attention.query_key_value.weight": "pytorch_model_00028-of-00072.bin",
+ "h.27.input_layernorm.bias": "pytorch_model_00029-of-00072.bin",
+ "h.27.input_layernorm.weight": "pytorch_model_00029-of-00072.bin",
+ "h.27.mlp.dense_4h_to_h.bias": "pytorch_model_00029-of-00072.bin",
+ "h.27.mlp.dense_4h_to_h.weight": "pytorch_model_00029-of-00072.bin",
+ "h.27.mlp.dense_h_to_4h.bias": "pytorch_model_00029-of-00072.bin",
+ "h.27.mlp.dense_h_to_4h.weight": "pytorch_model_00029-of-00072.bin",
+ "h.27.post_attention_layernorm.bias": "pytorch_model_00029-of-00072.bin",
+ "h.27.post_attention_layernorm.weight": "pytorch_model_00029-of-00072.bin",
+ "h.27.self_attention.dense.bias": "pytorch_model_00029-of-00072.bin",
+ "h.27.self_attention.dense.weight": "pytorch_model_00029-of-00072.bin",
+ "h.27.self_attention.query_key_value.bias": "pytorch_model_00029-of-00072.bin",
+ "h.27.self_attention.query_key_value.weight": "pytorch_model_00029-of-00072.bin",
+ "h.28.input_layernorm.bias": "pytorch_model_00030-of-00072.bin",
+ "h.28.input_layernorm.weight": "pytorch_model_00030-of-00072.bin",
+ "h.28.mlp.dense_4h_to_h.bias": "pytorch_model_00030-of-00072.bin",
+ "h.28.mlp.dense_4h_to_h.weight": "pytorch_model_00030-of-00072.bin",
+ "h.28.mlp.dense_h_to_4h.bias": "pytorch_model_00030-of-00072.bin",
+ "h.28.mlp.dense_h_to_4h.weight": "pytorch_model_00030-of-00072.bin",
+ "h.28.post_attention_layernorm.bias": "pytorch_model_00030-of-00072.bin",
+ "h.28.post_attention_layernorm.weight": "pytorch_model_00030-of-00072.bin",
+ "h.28.self_attention.dense.bias": "pytorch_model_00030-of-00072.bin",
+ "h.28.self_attention.dense.weight": "pytorch_model_00030-of-00072.bin",
+ "h.28.self_attention.query_key_value.bias": "pytorch_model_00030-of-00072.bin",
+ "h.28.self_attention.query_key_value.weight": "pytorch_model_00030-of-00072.bin",
+ "h.29.input_layernorm.bias": "pytorch_model_00031-of-00072.bin",
+ "h.29.input_layernorm.weight": "pytorch_model_00031-of-00072.bin",
+ "h.29.mlp.dense_4h_to_h.bias": "pytorch_model_00031-of-00072.bin",
+ "h.29.mlp.dense_4h_to_h.weight": "pytorch_model_00031-of-00072.bin",
+ "h.29.mlp.dense_h_to_4h.bias": "pytorch_model_00031-of-00072.bin",
+ "h.29.mlp.dense_h_to_4h.weight": "pytorch_model_00031-of-00072.bin",
+ "h.29.post_attention_layernorm.bias": "pytorch_model_00031-of-00072.bin",
+ "h.29.post_attention_layernorm.weight": "pytorch_model_00031-of-00072.bin",
+ "h.29.self_attention.dense.bias": "pytorch_model_00031-of-00072.bin",
+ "h.29.self_attention.dense.weight": "pytorch_model_00031-of-00072.bin",
+ "h.29.self_attention.query_key_value.bias": "pytorch_model_00031-of-00072.bin",
+ "h.29.self_attention.query_key_value.weight": "pytorch_model_00031-of-00072.bin",
+ "h.3.input_layernorm.bias": "pytorch_model_00005-of-00072.bin",
+ "h.3.input_layernorm.weight": "pytorch_model_00005-of-00072.bin",
+ "h.3.mlp.dense_4h_to_h.bias": "pytorch_model_00005-of-00072.bin",
+ "h.3.mlp.dense_4h_to_h.weight": "pytorch_model_00005-of-00072.bin",
+ "h.3.mlp.dense_h_to_4h.bias": "pytorch_model_00005-of-00072.bin",
+ "h.3.mlp.dense_h_to_4h.weight": "pytorch_model_00005-of-00072.bin",
+ "h.3.post_attention_layernorm.bias": "pytorch_model_00005-of-00072.bin",
+ "h.3.post_attention_layernorm.weight": "pytorch_model_00005-of-00072.bin",
+ "h.3.self_attention.dense.bias": "pytorch_model_00005-of-00072.bin",
+ "h.3.self_attention.dense.weight": "pytorch_model_00005-of-00072.bin",
+ "h.3.self_attention.query_key_value.bias": "pytorch_model_00005-of-00072.bin",
+ "h.3.self_attention.query_key_value.weight": "pytorch_model_00005-of-00072.bin",
+ "h.30.input_layernorm.bias": "pytorch_model_00032-of-00072.bin",
+ "h.30.input_layernorm.weight": "pytorch_model_00032-of-00072.bin",
+ "h.30.mlp.dense_4h_to_h.bias": "pytorch_model_00032-of-00072.bin",
+ "h.30.mlp.dense_4h_to_h.weight": "pytorch_model_00032-of-00072.bin",
+ "h.30.mlp.dense_h_to_4h.bias": "pytorch_model_00032-of-00072.bin",
+ "h.30.mlp.dense_h_to_4h.weight": "pytorch_model_00032-of-00072.bin",
+ "h.30.post_attention_layernorm.bias": "pytorch_model_00032-of-00072.bin",
+ "h.30.post_attention_layernorm.weight": "pytorch_model_00032-of-00072.bin",
+ "h.30.self_attention.dense.bias": "pytorch_model_00032-of-00072.bin",
+ "h.30.self_attention.dense.weight": "pytorch_model_00032-of-00072.bin",
+ "h.30.self_attention.query_key_value.bias": "pytorch_model_00032-of-00072.bin",
+ "h.30.self_attention.query_key_value.weight": "pytorch_model_00032-of-00072.bin",
+ "h.31.input_layernorm.bias": "pytorch_model_00033-of-00072.bin",
+ "h.31.input_layernorm.weight": "pytorch_model_00033-of-00072.bin",
+ "h.31.mlp.dense_4h_to_h.bias": "pytorch_model_00033-of-00072.bin",
+ "h.31.mlp.dense_4h_to_h.weight": "pytorch_model_00033-of-00072.bin",
+ "h.31.mlp.dense_h_to_4h.bias": "pytorch_model_00033-of-00072.bin",
+ "h.31.mlp.dense_h_to_4h.weight": "pytorch_model_00033-of-00072.bin",
+ "h.31.post_attention_layernorm.bias": "pytorch_model_00033-of-00072.bin",
+ "h.31.post_attention_layernorm.weight": "pytorch_model_00033-of-00072.bin",
+ "h.31.self_attention.dense.bias": "pytorch_model_00033-of-00072.bin",
+ "h.31.self_attention.dense.weight": "pytorch_model_00033-of-00072.bin",
+ "h.31.self_attention.query_key_value.bias": "pytorch_model_00033-of-00072.bin",
+ "h.31.self_attention.query_key_value.weight": "pytorch_model_00033-of-00072.bin",
+ "h.32.input_layernorm.bias": "pytorch_model_00034-of-00072.bin",
+ "h.32.input_layernorm.weight": "pytorch_model_00034-of-00072.bin",
+ "h.32.mlp.dense_4h_to_h.bias": "pytorch_model_00034-of-00072.bin",
+ "h.32.mlp.dense_4h_to_h.weight": "pytorch_model_00034-of-00072.bin",
+ "h.32.mlp.dense_h_to_4h.bias": "pytorch_model_00034-of-00072.bin",
+ "h.32.mlp.dense_h_to_4h.weight": "pytorch_model_00034-of-00072.bin",
+ "h.32.post_attention_layernorm.bias": "pytorch_model_00034-of-00072.bin",
+ "h.32.post_attention_layernorm.weight": "pytorch_model_00034-of-00072.bin",
+ "h.32.self_attention.dense.bias": "pytorch_model_00034-of-00072.bin",
+ "h.32.self_attention.dense.weight": "pytorch_model_00034-of-00072.bin",
+ "h.32.self_attention.query_key_value.bias": "pytorch_model_00034-of-00072.bin",
+ "h.32.self_attention.query_key_value.weight": "pytorch_model_00034-of-00072.bin",
+ "h.33.input_layernorm.bias": "pytorch_model_00035-of-00072.bin",
+ "h.33.input_layernorm.weight": "pytorch_model_00035-of-00072.bin",
+ "h.33.mlp.dense_4h_to_h.bias": "pytorch_model_00035-of-00072.bin",
+ "h.33.mlp.dense_4h_to_h.weight": "pytorch_model_00035-of-00072.bin",
+ "h.33.mlp.dense_h_to_4h.bias": "pytorch_model_00035-of-00072.bin",
+ "h.33.mlp.dense_h_to_4h.weight": "pytorch_model_00035-of-00072.bin",
+ "h.33.post_attention_layernorm.bias": "pytorch_model_00035-of-00072.bin",
+ "h.33.post_attention_layernorm.weight": "pytorch_model_00035-of-00072.bin",
+ "h.33.self_attention.dense.bias": "pytorch_model_00035-of-00072.bin",
+ "h.33.self_attention.dense.weight": "pytorch_model_00035-of-00072.bin",
+ "h.33.self_attention.query_key_value.bias": "pytorch_model_00035-of-00072.bin",
+ "h.33.self_attention.query_key_value.weight": "pytorch_model_00035-of-00072.bin",
+ "h.34.input_layernorm.bias": "pytorch_model_00036-of-00072.bin",
+ "h.34.input_layernorm.weight": "pytorch_model_00036-of-00072.bin",
+ "h.34.mlp.dense_4h_to_h.bias": "pytorch_model_00036-of-00072.bin",
+ "h.34.mlp.dense_4h_to_h.weight": "pytorch_model_00036-of-00072.bin",
+ "h.34.mlp.dense_h_to_4h.bias": "pytorch_model_00036-of-00072.bin",
+ "h.34.mlp.dense_h_to_4h.weight": "pytorch_model_00036-of-00072.bin",
+ "h.34.post_attention_layernorm.bias": "pytorch_model_00036-of-00072.bin",
+ "h.34.post_attention_layernorm.weight": "pytorch_model_00036-of-00072.bin",
+ "h.34.self_attention.dense.bias": "pytorch_model_00036-of-00072.bin",
+ "h.34.self_attention.dense.weight": "pytorch_model_00036-of-00072.bin",
+ "h.34.self_attention.query_key_value.bias": "pytorch_model_00036-of-00072.bin",
+ "h.34.self_attention.query_key_value.weight": "pytorch_model_00036-of-00072.bin",
+ "h.35.input_layernorm.bias": "pytorch_model_00037-of-00072.bin",
+ "h.35.input_layernorm.weight": "pytorch_model_00037-of-00072.bin",
+ "h.35.mlp.dense_4h_to_h.bias": "pytorch_model_00037-of-00072.bin",
+ "h.35.mlp.dense_4h_to_h.weight": "pytorch_model_00037-of-00072.bin",
+ "h.35.mlp.dense_h_to_4h.bias": "pytorch_model_00037-of-00072.bin",
+ "h.35.mlp.dense_h_to_4h.weight": "pytorch_model_00037-of-00072.bin",
+ "h.35.post_attention_layernorm.bias": "pytorch_model_00037-of-00072.bin",
+ "h.35.post_attention_layernorm.weight": "pytorch_model_00037-of-00072.bin",
+ "h.35.self_attention.dense.bias": "pytorch_model_00037-of-00072.bin",
+ "h.35.self_attention.dense.weight": "pytorch_model_00037-of-00072.bin",
+ "h.35.self_attention.query_key_value.bias": "pytorch_model_00037-of-00072.bin",
+ "h.35.self_attention.query_key_value.weight": "pytorch_model_00037-of-00072.bin",
+ "h.36.input_layernorm.bias": "pytorch_model_00038-of-00072.bin",
+ "h.36.input_layernorm.weight": "pytorch_model_00038-of-00072.bin",
+ "h.36.mlp.dense_4h_to_h.bias": "pytorch_model_00038-of-00072.bin",
+ "h.36.mlp.dense_4h_to_h.weight": "pytorch_model_00038-of-00072.bin",
+ "h.36.mlp.dense_h_to_4h.bias": "pytorch_model_00038-of-00072.bin",
+ "h.36.mlp.dense_h_to_4h.weight": "pytorch_model_00038-of-00072.bin",
+ "h.36.post_attention_layernorm.bias": "pytorch_model_00038-of-00072.bin",
+ "h.36.post_attention_layernorm.weight": "pytorch_model_00038-of-00072.bin",
+ "h.36.self_attention.dense.bias": "pytorch_model_00038-of-00072.bin",
+ "h.36.self_attention.dense.weight": "pytorch_model_00038-of-00072.bin",
+ "h.36.self_attention.query_key_value.bias": "pytorch_model_00038-of-00072.bin",
+ "h.36.self_attention.query_key_value.weight": "pytorch_model_00038-of-00072.bin",
+ "h.37.input_layernorm.bias": "pytorch_model_00039-of-00072.bin",
+ "h.37.input_layernorm.weight": "pytorch_model_00039-of-00072.bin",
+ "h.37.mlp.dense_4h_to_h.bias": "pytorch_model_00039-of-00072.bin",
+ "h.37.mlp.dense_4h_to_h.weight": "pytorch_model_00039-of-00072.bin",
+ "h.37.mlp.dense_h_to_4h.bias": "pytorch_model_00039-of-00072.bin",
+ "h.37.mlp.dense_h_to_4h.weight": "pytorch_model_00039-of-00072.bin",
+ "h.37.post_attention_layernorm.bias": "pytorch_model_00039-of-00072.bin",
+ "h.37.post_attention_layernorm.weight": "pytorch_model_00039-of-00072.bin",
+ "h.37.self_attention.dense.bias": "pytorch_model_00039-of-00072.bin",
+ "h.37.self_attention.dense.weight": "pytorch_model_00039-of-00072.bin",
+ "h.37.self_attention.query_key_value.bias": "pytorch_model_00039-of-00072.bin",
+ "h.37.self_attention.query_key_value.weight": "pytorch_model_00039-of-00072.bin",
+ "h.38.input_layernorm.bias": "pytorch_model_00040-of-00072.bin",
+ "h.38.input_layernorm.weight": "pytorch_model_00040-of-00072.bin",
+ "h.38.mlp.dense_4h_to_h.bias": "pytorch_model_00040-of-00072.bin",
+ "h.38.mlp.dense_4h_to_h.weight": "pytorch_model_00040-of-00072.bin",
+ "h.38.mlp.dense_h_to_4h.bias": "pytorch_model_00040-of-00072.bin",
+ "h.38.mlp.dense_h_to_4h.weight": "pytorch_model_00040-of-00072.bin",
+ "h.38.post_attention_layernorm.bias": "pytorch_model_00040-of-00072.bin",
+ "h.38.post_attention_layernorm.weight": "pytorch_model_00040-of-00072.bin",
+ "h.38.self_attention.dense.bias": "pytorch_model_00040-of-00072.bin",
+ "h.38.self_attention.dense.weight": "pytorch_model_00040-of-00072.bin",
+ "h.38.self_attention.query_key_value.bias": "pytorch_model_00040-of-00072.bin",
+ "h.38.self_attention.query_key_value.weight": "pytorch_model_00040-of-00072.bin",
+ "h.39.input_layernorm.bias": "pytorch_model_00041-of-00072.bin",
+ "h.39.input_layernorm.weight": "pytorch_model_00041-of-00072.bin",
+ "h.39.mlp.dense_4h_to_h.bias": "pytorch_model_00041-of-00072.bin",
+ "h.39.mlp.dense_4h_to_h.weight": "pytorch_model_00041-of-00072.bin",
+ "h.39.mlp.dense_h_to_4h.bias": "pytorch_model_00041-of-00072.bin",
+ "h.39.mlp.dense_h_to_4h.weight": "pytorch_model_00041-of-00072.bin",
+ "h.39.post_attention_layernorm.bias": "pytorch_model_00041-of-00072.bin",
+ "h.39.post_attention_layernorm.weight": "pytorch_model_00041-of-00072.bin",
+ "h.39.self_attention.dense.bias": "pytorch_model_00041-of-00072.bin",
+ "h.39.self_attention.dense.weight": "pytorch_model_00041-of-00072.bin",
+ "h.39.self_attention.query_key_value.bias": "pytorch_model_00041-of-00072.bin",
+ "h.39.self_attention.query_key_value.weight": "pytorch_model_00041-of-00072.bin",
+ "h.4.input_layernorm.bias": "pytorch_model_00006-of-00072.bin",
+ "h.4.input_layernorm.weight": "pytorch_model_00006-of-00072.bin",
+ "h.4.mlp.dense_4h_to_h.bias": "pytorch_model_00006-of-00072.bin",
+ "h.4.mlp.dense_4h_to_h.weight": "pytorch_model_00006-of-00072.bin",
+ "h.4.mlp.dense_h_to_4h.bias": "pytorch_model_00006-of-00072.bin",
+ "h.4.mlp.dense_h_to_4h.weight": "pytorch_model_00006-of-00072.bin",
+ "h.4.post_attention_layernorm.bias": "pytorch_model_00006-of-00072.bin",
+ "h.4.post_attention_layernorm.weight": "pytorch_model_00006-of-00072.bin",
+ "h.4.self_attention.dense.bias": "pytorch_model_00006-of-00072.bin",
+ "h.4.self_attention.dense.weight": "pytorch_model_00006-of-00072.bin",
+ "h.4.self_attention.query_key_value.bias": "pytorch_model_00006-of-00072.bin",
+ "h.4.self_attention.query_key_value.weight": "pytorch_model_00006-of-00072.bin",
+ "h.40.input_layernorm.bias": "pytorch_model_00042-of-00072.bin",
+ "h.40.input_layernorm.weight": "pytorch_model_00042-of-00072.bin",
+ "h.40.mlp.dense_4h_to_h.bias": "pytorch_model_00042-of-00072.bin",
+ "h.40.mlp.dense_4h_to_h.weight": "pytorch_model_00042-of-00072.bin",
+ "h.40.mlp.dense_h_to_4h.bias": "pytorch_model_00042-of-00072.bin",
+ "h.40.mlp.dense_h_to_4h.weight": "pytorch_model_00042-of-00072.bin",
+ "h.40.post_attention_layernorm.bias": "pytorch_model_00042-of-00072.bin",
+ "h.40.post_attention_layernorm.weight": "pytorch_model_00042-of-00072.bin",
+ "h.40.self_attention.dense.bias": "pytorch_model_00042-of-00072.bin",
+ "h.40.self_attention.dense.weight": "pytorch_model_00042-of-00072.bin",
+ "h.40.self_attention.query_key_value.bias": "pytorch_model_00042-of-00072.bin",
+ "h.40.self_attention.query_key_value.weight": "pytorch_model_00042-of-00072.bin",
+ "h.41.input_layernorm.bias": "pytorch_model_00043-of-00072.bin",
+ "h.41.input_layernorm.weight": "pytorch_model_00043-of-00072.bin",
+ "h.41.mlp.dense_4h_to_h.bias": "pytorch_model_00043-of-00072.bin",
+ "h.41.mlp.dense_4h_to_h.weight": "pytorch_model_00043-of-00072.bin",
+ "h.41.mlp.dense_h_to_4h.bias": "pytorch_model_00043-of-00072.bin",
+ "h.41.mlp.dense_h_to_4h.weight": "pytorch_model_00043-of-00072.bin",
+ "h.41.post_attention_layernorm.bias": "pytorch_model_00043-of-00072.bin",
+ "h.41.post_attention_layernorm.weight": "pytorch_model_00043-of-00072.bin",
+ "h.41.self_attention.dense.bias": "pytorch_model_00043-of-00072.bin",
+ "h.41.self_attention.dense.weight": "pytorch_model_00043-of-00072.bin",
+ "h.41.self_attention.query_key_value.bias": "pytorch_model_00043-of-00072.bin",
+ "h.41.self_attention.query_key_value.weight": "pytorch_model_00043-of-00072.bin",
+ "h.42.input_layernorm.bias": "pytorch_model_00044-of-00072.bin",
+ "h.42.input_layernorm.weight": "pytorch_model_00044-of-00072.bin",
+ "h.42.mlp.dense_4h_to_h.bias": "pytorch_model_00044-of-00072.bin",
+ "h.42.mlp.dense_4h_to_h.weight": "pytorch_model_00044-of-00072.bin",
+ "h.42.mlp.dense_h_to_4h.bias": "pytorch_model_00044-of-00072.bin",
+ "h.42.mlp.dense_h_to_4h.weight": "pytorch_model_00044-of-00072.bin",
+ "h.42.post_attention_layernorm.bias": "pytorch_model_00044-of-00072.bin",
+ "h.42.post_attention_layernorm.weight": "pytorch_model_00044-of-00072.bin",
+ "h.42.self_attention.dense.bias": "pytorch_model_00044-of-00072.bin",
+ "h.42.self_attention.dense.weight": "pytorch_model_00044-of-00072.bin",
+ "h.42.self_attention.query_key_value.bias": "pytorch_model_00044-of-00072.bin",
+ "h.42.self_attention.query_key_value.weight": "pytorch_model_00044-of-00072.bin",
+ "h.43.input_layernorm.bias": "pytorch_model_00045-of-00072.bin",
+ "h.43.input_layernorm.weight": "pytorch_model_00045-of-00072.bin",
+ "h.43.mlp.dense_4h_to_h.bias": "pytorch_model_00045-of-00072.bin",
+ "h.43.mlp.dense_4h_to_h.weight": "pytorch_model_00045-of-00072.bin",
+ "h.43.mlp.dense_h_to_4h.bias": "pytorch_model_00045-of-00072.bin",
+ "h.43.mlp.dense_h_to_4h.weight": "pytorch_model_00045-of-00072.bin",
+ "h.43.post_attention_layernorm.bias": "pytorch_model_00045-of-00072.bin",
+ "h.43.post_attention_layernorm.weight": "pytorch_model_00045-of-00072.bin",
+ "h.43.self_attention.dense.bias": "pytorch_model_00045-of-00072.bin",
+ "h.43.self_attention.dense.weight": "pytorch_model_00045-of-00072.bin",
+ "h.43.self_attention.query_key_value.bias": "pytorch_model_00045-of-00072.bin",
+ "h.43.self_attention.query_key_value.weight": "pytorch_model_00045-of-00072.bin",
+ "h.44.input_layernorm.bias": "pytorch_model_00046-of-00072.bin",
+ "h.44.input_layernorm.weight": "pytorch_model_00046-of-00072.bin",
+ "h.44.mlp.dense_4h_to_h.bias": "pytorch_model_00046-of-00072.bin",
+ "h.44.mlp.dense_4h_to_h.weight": "pytorch_model_00046-of-00072.bin",
+ "h.44.mlp.dense_h_to_4h.bias": "pytorch_model_00046-of-00072.bin",
+ "h.44.mlp.dense_h_to_4h.weight": "pytorch_model_00046-of-00072.bin",
+ "h.44.post_attention_layernorm.bias": "pytorch_model_00046-of-00072.bin",
+ "h.44.post_attention_layernorm.weight": "pytorch_model_00046-of-00072.bin",
+ "h.44.self_attention.dense.bias": "pytorch_model_00046-of-00072.bin",
+ "h.44.self_attention.dense.weight": "pytorch_model_00046-of-00072.bin",
+ "h.44.self_attention.query_key_value.bias": "pytorch_model_00046-of-00072.bin",
+ "h.44.self_attention.query_key_value.weight": "pytorch_model_00046-of-00072.bin",
+ "h.45.input_layernorm.bias": "pytorch_model_00047-of-00072.bin",
+ "h.45.input_layernorm.weight": "pytorch_model_00047-of-00072.bin",
+ "h.45.mlp.dense_4h_to_h.bias": "pytorch_model_00047-of-00072.bin",
+ "h.45.mlp.dense_4h_to_h.weight": "pytorch_model_00047-of-00072.bin",
+ "h.45.mlp.dense_h_to_4h.bias": "pytorch_model_00047-of-00072.bin",
+ "h.45.mlp.dense_h_to_4h.weight": "pytorch_model_00047-of-00072.bin",
+ "h.45.post_attention_layernorm.bias": "pytorch_model_00047-of-00072.bin",
+ "h.45.post_attention_layernorm.weight": "pytorch_model_00047-of-00072.bin",
+ "h.45.self_attention.dense.bias": "pytorch_model_00047-of-00072.bin",
+ "h.45.self_attention.dense.weight": "pytorch_model_00047-of-00072.bin",
+ "h.45.self_attention.query_key_value.bias": "pytorch_model_00047-of-00072.bin",
+ "h.45.self_attention.query_key_value.weight": "pytorch_model_00047-of-00072.bin",
+ "h.46.input_layernorm.bias": "pytorch_model_00048-of-00072.bin",
+ "h.46.input_layernorm.weight": "pytorch_model_00048-of-00072.bin",
+ "h.46.mlp.dense_4h_to_h.bias": "pytorch_model_00048-of-00072.bin",
+ "h.46.mlp.dense_4h_to_h.weight": "pytorch_model_00048-of-00072.bin",
+ "h.46.mlp.dense_h_to_4h.bias": "pytorch_model_00048-of-00072.bin",
+ "h.46.mlp.dense_h_to_4h.weight": "pytorch_model_00048-of-00072.bin",
+ "h.46.post_attention_layernorm.bias": "pytorch_model_00048-of-00072.bin",
+ "h.46.post_attention_layernorm.weight": "pytorch_model_00048-of-00072.bin",
+ "h.46.self_attention.dense.bias": "pytorch_model_00048-of-00072.bin",
+ "h.46.self_attention.dense.weight": "pytorch_model_00048-of-00072.bin",
+ "h.46.self_attention.query_key_value.bias": "pytorch_model_00048-of-00072.bin",
+ "h.46.self_attention.query_key_value.weight": "pytorch_model_00048-of-00072.bin",
+ "h.47.input_layernorm.bias": "pytorch_model_00049-of-00072.bin",
+ "h.47.input_layernorm.weight": "pytorch_model_00049-of-00072.bin",
+ "h.47.mlp.dense_4h_to_h.bias": "pytorch_model_00049-of-00072.bin",
+ "h.47.mlp.dense_4h_to_h.weight": "pytorch_model_00049-of-00072.bin",
+ "h.47.mlp.dense_h_to_4h.bias": "pytorch_model_00049-of-00072.bin",
+ "h.47.mlp.dense_h_to_4h.weight": "pytorch_model_00049-of-00072.bin",
+ "h.47.post_attention_layernorm.bias": "pytorch_model_00049-of-00072.bin",
+ "h.47.post_attention_layernorm.weight": "pytorch_model_00049-of-00072.bin",
+ "h.47.self_attention.dense.bias": "pytorch_model_00049-of-00072.bin",
+ "h.47.self_attention.dense.weight": "pytorch_model_00049-of-00072.bin",
+ "h.47.self_attention.query_key_value.bias": "pytorch_model_00049-of-00072.bin",
+ "h.47.self_attention.query_key_value.weight": "pytorch_model_00049-of-00072.bin",
+ "h.48.input_layernorm.bias": "pytorch_model_00050-of-00072.bin",
+ "h.48.input_layernorm.weight": "pytorch_model_00050-of-00072.bin",
+ "h.48.mlp.dense_4h_to_h.bias": "pytorch_model_00050-of-00072.bin",
+ "h.48.mlp.dense_4h_to_h.weight": "pytorch_model_00050-of-00072.bin",
+ "h.48.mlp.dense_h_to_4h.bias": "pytorch_model_00050-of-00072.bin",
+ "h.48.mlp.dense_h_to_4h.weight": "pytorch_model_00050-of-00072.bin",
+ "h.48.post_attention_layernorm.bias": "pytorch_model_00050-of-00072.bin",
+ "h.48.post_attention_layernorm.weight": "pytorch_model_00050-of-00072.bin",
+ "h.48.self_attention.dense.bias": "pytorch_model_00050-of-00072.bin",
+ "h.48.self_attention.dense.weight": "pytorch_model_00050-of-00072.bin",
+ "h.48.self_attention.query_key_value.bias": "pytorch_model_00050-of-00072.bin",
+ "h.48.self_attention.query_key_value.weight": "pytorch_model_00050-of-00072.bin",
+ "h.49.input_layernorm.bias": "pytorch_model_00051-of-00072.bin",
+ "h.49.input_layernorm.weight": "pytorch_model_00051-of-00072.bin",
+ "h.49.mlp.dense_4h_to_h.bias": "pytorch_model_00051-of-00072.bin",
+ "h.49.mlp.dense_4h_to_h.weight": "pytorch_model_00051-of-00072.bin",
+ "h.49.mlp.dense_h_to_4h.bias": "pytorch_model_00051-of-00072.bin",
+ "h.49.mlp.dense_h_to_4h.weight": "pytorch_model_00051-of-00072.bin",
+ "h.49.post_attention_layernorm.bias": "pytorch_model_00051-of-00072.bin",
+ "h.49.post_attention_layernorm.weight": "pytorch_model_00051-of-00072.bin",
+ "h.49.self_attention.dense.bias": "pytorch_model_00051-of-00072.bin",
+ "h.49.self_attention.dense.weight": "pytorch_model_00051-of-00072.bin",
+ "h.49.self_attention.query_key_value.bias": "pytorch_model_00051-of-00072.bin",
+ "h.49.self_attention.query_key_value.weight": "pytorch_model_00051-of-00072.bin",
+ "h.5.input_layernorm.bias": "pytorch_model_00007-of-00072.bin",
+ "h.5.input_layernorm.weight": "pytorch_model_00007-of-00072.bin",
+ "h.5.mlp.dense_4h_to_h.bias": "pytorch_model_00007-of-00072.bin",
+ "h.5.mlp.dense_4h_to_h.weight": "pytorch_model_00007-of-00072.bin",
+ "h.5.mlp.dense_h_to_4h.bias": "pytorch_model_00007-of-00072.bin",
+ "h.5.mlp.dense_h_to_4h.weight": "pytorch_model_00007-of-00072.bin",
+ "h.5.post_attention_layernorm.bias": "pytorch_model_00007-of-00072.bin",
+ "h.5.post_attention_layernorm.weight": "pytorch_model_00007-of-00072.bin",
+ "h.5.self_attention.dense.bias": "pytorch_model_00007-of-00072.bin",
+ "h.5.self_attention.dense.weight": "pytorch_model_00007-of-00072.bin",
+ "h.5.self_attention.query_key_value.bias": "pytorch_model_00007-of-00072.bin",
+ "h.5.self_attention.query_key_value.weight": "pytorch_model_00007-of-00072.bin",
+ "h.50.input_layernorm.bias": "pytorch_model_00052-of-00072.bin",
+ "h.50.input_layernorm.weight": "pytorch_model_00052-of-00072.bin",
+ "h.50.mlp.dense_4h_to_h.bias": "pytorch_model_00052-of-00072.bin",
+ "h.50.mlp.dense_4h_to_h.weight": "pytorch_model_00052-of-00072.bin",
+ "h.50.mlp.dense_h_to_4h.bias": "pytorch_model_00052-of-00072.bin",
+ "h.50.mlp.dense_h_to_4h.weight": "pytorch_model_00052-of-00072.bin",
+ "h.50.post_attention_layernorm.bias": "pytorch_model_00052-of-00072.bin",
+ "h.50.post_attention_layernorm.weight": "pytorch_model_00052-of-00072.bin",
+ "h.50.self_attention.dense.bias": "pytorch_model_00052-of-00072.bin",
+ "h.50.self_attention.dense.weight": "pytorch_model_00052-of-00072.bin",
+ "h.50.self_attention.query_key_value.bias": "pytorch_model_00052-of-00072.bin",
+ "h.50.self_attention.query_key_value.weight": "pytorch_model_00052-of-00072.bin",
+ "h.51.input_layernorm.bias": "pytorch_model_00053-of-00072.bin",
+ "h.51.input_layernorm.weight": "pytorch_model_00053-of-00072.bin",
+ "h.51.mlp.dense_4h_to_h.bias": "pytorch_model_00053-of-00072.bin",
+ "h.51.mlp.dense_4h_to_h.weight": "pytorch_model_00053-of-00072.bin",
+ "h.51.mlp.dense_h_to_4h.bias": "pytorch_model_00053-of-00072.bin",
+ "h.51.mlp.dense_h_to_4h.weight": "pytorch_model_00053-of-00072.bin",
+ "h.51.post_attention_layernorm.bias": "pytorch_model_00053-of-00072.bin",
+ "h.51.post_attention_layernorm.weight": "pytorch_model_00053-of-00072.bin",
+ "h.51.self_attention.dense.bias": "pytorch_model_00053-of-00072.bin",
+ "h.51.self_attention.dense.weight": "pytorch_model_00053-of-00072.bin",
+ "h.51.self_attention.query_key_value.bias": "pytorch_model_00053-of-00072.bin",
+ "h.51.self_attention.query_key_value.weight": "pytorch_model_00053-of-00072.bin",
+ "h.52.input_layernorm.bias": "pytorch_model_00054-of-00072.bin",
+ "h.52.input_layernorm.weight": "pytorch_model_00054-of-00072.bin",
+ "h.52.mlp.dense_4h_to_h.bias": "pytorch_model_00054-of-00072.bin",
+ "h.52.mlp.dense_4h_to_h.weight": "pytorch_model_00054-of-00072.bin",
+ "h.52.mlp.dense_h_to_4h.bias": "pytorch_model_00054-of-00072.bin",
+ "h.52.mlp.dense_h_to_4h.weight": "pytorch_model_00054-of-00072.bin",
+ "h.52.post_attention_layernorm.bias": "pytorch_model_00054-of-00072.bin",
+ "h.52.post_attention_layernorm.weight": "pytorch_model_00054-of-00072.bin",
+ "h.52.self_attention.dense.bias": "pytorch_model_00054-of-00072.bin",
+ "h.52.self_attention.dense.weight": "pytorch_model_00054-of-00072.bin",
+ "h.52.self_attention.query_key_value.bias": "pytorch_model_00054-of-00072.bin",
+ "h.52.self_attention.query_key_value.weight": "pytorch_model_00054-of-00072.bin",
+ "h.53.input_layernorm.bias": "pytorch_model_00055-of-00072.bin",
+ "h.53.input_layernorm.weight": "pytorch_model_00055-of-00072.bin",
+ "h.53.mlp.dense_4h_to_h.bias": "pytorch_model_00055-of-00072.bin",
+ "h.53.mlp.dense_4h_to_h.weight": "pytorch_model_00055-of-00072.bin",
+ "h.53.mlp.dense_h_to_4h.bias": "pytorch_model_00055-of-00072.bin",
+ "h.53.mlp.dense_h_to_4h.weight": "pytorch_model_00055-of-00072.bin",
+ "h.53.post_attention_layernorm.bias": "pytorch_model_00055-of-00072.bin",
+ "h.53.post_attention_layernorm.weight": "pytorch_model_00055-of-00072.bin",
+ "h.53.self_attention.dense.bias": "pytorch_model_00055-of-00072.bin",
+ "h.53.self_attention.dense.weight": "pytorch_model_00055-of-00072.bin",
+ "h.53.self_attention.query_key_value.bias": "pytorch_model_00055-of-00072.bin",
+ "h.53.self_attention.query_key_value.weight": "pytorch_model_00055-of-00072.bin",
+ "h.54.input_layernorm.bias": "pytorch_model_00056-of-00072.bin",
+ "h.54.input_layernorm.weight": "pytorch_model_00056-of-00072.bin",
+ "h.54.mlp.dense_4h_to_h.bias": "pytorch_model_00056-of-00072.bin",
+ "h.54.mlp.dense_4h_to_h.weight": "pytorch_model_00056-of-00072.bin",
+ "h.54.mlp.dense_h_to_4h.bias": "pytorch_model_00056-of-00072.bin",
+ "h.54.mlp.dense_h_to_4h.weight": "pytorch_model_00056-of-00072.bin",
+ "h.54.post_attention_layernorm.bias": "pytorch_model_00056-of-00072.bin",
+ "h.54.post_attention_layernorm.weight": "pytorch_model_00056-of-00072.bin",
+ "h.54.self_attention.dense.bias": "pytorch_model_00056-of-00072.bin",
+ "h.54.self_attention.dense.weight": "pytorch_model_00056-of-00072.bin",
+ "h.54.self_attention.query_key_value.bias": "pytorch_model_00056-of-00072.bin",
+ "h.54.self_attention.query_key_value.weight": "pytorch_model_00056-of-00072.bin",
+ "h.55.input_layernorm.bias": "pytorch_model_00057-of-00072.bin",
+ "h.55.input_layernorm.weight": "pytorch_model_00057-of-00072.bin",
+ "h.55.mlp.dense_4h_to_h.bias": "pytorch_model_00057-of-00072.bin",
+ "h.55.mlp.dense_4h_to_h.weight": "pytorch_model_00057-of-00072.bin",
+ "h.55.mlp.dense_h_to_4h.bias": "pytorch_model_00057-of-00072.bin",
+ "h.55.mlp.dense_h_to_4h.weight": "pytorch_model_00057-of-00072.bin",
+ "h.55.post_attention_layernorm.bias": "pytorch_model_00057-of-00072.bin",
+ "h.55.post_attention_layernorm.weight": "pytorch_model_00057-of-00072.bin",
+ "h.55.self_attention.dense.bias": "pytorch_model_00057-of-00072.bin",
+ "h.55.self_attention.dense.weight": "pytorch_model_00057-of-00072.bin",
+ "h.55.self_attention.query_key_value.bias": "pytorch_model_00057-of-00072.bin",
+ "h.55.self_attention.query_key_value.weight": "pytorch_model_00057-of-00072.bin",
+ "h.56.input_layernorm.bias": "pytorch_model_00058-of-00072.bin",
+ "h.56.input_layernorm.weight": "pytorch_model_00058-of-00072.bin",
+ "h.56.mlp.dense_4h_to_h.bias": "pytorch_model_00058-of-00072.bin",
+ "h.56.mlp.dense_4h_to_h.weight": "pytorch_model_00058-of-00072.bin",
+ "h.56.mlp.dense_h_to_4h.bias": "pytorch_model_00058-of-00072.bin",
+ "h.56.mlp.dense_h_to_4h.weight": "pytorch_model_00058-of-00072.bin",
+ "h.56.post_attention_layernorm.bias": "pytorch_model_00058-of-00072.bin",
+ "h.56.post_attention_layernorm.weight": "pytorch_model_00058-of-00072.bin",
+ "h.56.self_attention.dense.bias": "pytorch_model_00058-of-00072.bin",
+ "h.56.self_attention.dense.weight": "pytorch_model_00058-of-00072.bin",
+ "h.56.self_attention.query_key_value.bias": "pytorch_model_00058-of-00072.bin",
+ "h.56.self_attention.query_key_value.weight": "pytorch_model_00058-of-00072.bin",
+ "h.57.input_layernorm.bias": "pytorch_model_00059-of-00072.bin",
+ "h.57.input_layernorm.weight": "pytorch_model_00059-of-00072.bin",
+ "h.57.mlp.dense_4h_to_h.bias": "pytorch_model_00059-of-00072.bin",
+ "h.57.mlp.dense_4h_to_h.weight": "pytorch_model_00059-of-00072.bin",
+ "h.57.mlp.dense_h_to_4h.bias": "pytorch_model_00059-of-00072.bin",
+ "h.57.mlp.dense_h_to_4h.weight": "pytorch_model_00059-of-00072.bin",
+ "h.57.post_attention_layernorm.bias": "pytorch_model_00059-of-00072.bin",
+ "h.57.post_attention_layernorm.weight": "pytorch_model_00059-of-00072.bin",
+ "h.57.self_attention.dense.bias": "pytorch_model_00059-of-00072.bin",
+ "h.57.self_attention.dense.weight": "pytorch_model_00059-of-00072.bin",
+ "h.57.self_attention.query_key_value.bias": "pytorch_model_00059-of-00072.bin",
+ "h.57.self_attention.query_key_value.weight": "pytorch_model_00059-of-00072.bin",
+ "h.58.input_layernorm.bias": "pytorch_model_00060-of-00072.bin",
+ "h.58.input_layernorm.weight": "pytorch_model_00060-of-00072.bin",
+ "h.58.mlp.dense_4h_to_h.bias": "pytorch_model_00060-of-00072.bin",
+ "h.58.mlp.dense_4h_to_h.weight": "pytorch_model_00060-of-00072.bin",
+ "h.58.mlp.dense_h_to_4h.bias": "pytorch_model_00060-of-00072.bin",
+ "h.58.mlp.dense_h_to_4h.weight": "pytorch_model_00060-of-00072.bin",
+ "h.58.post_attention_layernorm.bias": "pytorch_model_00060-of-00072.bin",
+ "h.58.post_attention_layernorm.weight": "pytorch_model_00060-of-00072.bin",
+ "h.58.self_attention.dense.bias": "pytorch_model_00060-of-00072.bin",
+ "h.58.self_attention.dense.weight": "pytorch_model_00060-of-00072.bin",
+ "h.58.self_attention.query_key_value.bias": "pytorch_model_00060-of-00072.bin",
+ "h.58.self_attention.query_key_value.weight": "pytorch_model_00060-of-00072.bin",
+ "h.59.input_layernorm.bias": "pytorch_model_00061-of-00072.bin",
+ "h.59.input_layernorm.weight": "pytorch_model_00061-of-00072.bin",
+ "h.59.mlp.dense_4h_to_h.bias": "pytorch_model_00061-of-00072.bin",
+ "h.59.mlp.dense_4h_to_h.weight": "pytorch_model_00061-of-00072.bin",
+ "h.59.mlp.dense_h_to_4h.bias": "pytorch_model_00061-of-00072.bin",
+ "h.59.mlp.dense_h_to_4h.weight": "pytorch_model_00061-of-00072.bin",
+ "h.59.post_attention_layernorm.bias": "pytorch_model_00061-of-00072.bin",
+ "h.59.post_attention_layernorm.weight": "pytorch_model_00061-of-00072.bin",
+ "h.59.self_attention.dense.bias": "pytorch_model_00061-of-00072.bin",
+ "h.59.self_attention.dense.weight": "pytorch_model_00061-of-00072.bin",
+ "h.59.self_attention.query_key_value.bias": "pytorch_model_00061-of-00072.bin",
+ "h.59.self_attention.query_key_value.weight": "pytorch_model_00061-of-00072.bin",
+ "h.6.input_layernorm.bias": "pytorch_model_00008-of-00072.bin",
+ "h.6.input_layernorm.weight": "pytorch_model_00008-of-00072.bin",
+ "h.6.mlp.dense_4h_to_h.bias": "pytorch_model_00008-of-00072.bin",
+ "h.6.mlp.dense_4h_to_h.weight": "pytorch_model_00008-of-00072.bin",
+ "h.6.mlp.dense_h_to_4h.bias": "pytorch_model_00008-of-00072.bin",
+ "h.6.mlp.dense_h_to_4h.weight": "pytorch_model_00008-of-00072.bin",
+ "h.6.post_attention_layernorm.bias": "pytorch_model_00008-of-00072.bin",
+ "h.6.post_attention_layernorm.weight": "pytorch_model_00008-of-00072.bin",
+ "h.6.self_attention.dense.bias": "pytorch_model_00008-of-00072.bin",
+ "h.6.self_attention.dense.weight": "pytorch_model_00008-of-00072.bin",
+ "h.6.self_attention.query_key_value.bias": "pytorch_model_00008-of-00072.bin",
+ "h.6.self_attention.query_key_value.weight": "pytorch_model_00008-of-00072.bin",
+ "h.60.input_layernorm.bias": "pytorch_model_00062-of-00072.bin",
+ "h.60.input_layernorm.weight": "pytorch_model_00062-of-00072.bin",
+ "h.60.mlp.dense_4h_to_h.bias": "pytorch_model_00062-of-00072.bin",
+ "h.60.mlp.dense_4h_to_h.weight": "pytorch_model_00062-of-00072.bin",
+ "h.60.mlp.dense_h_to_4h.bias": "pytorch_model_00062-of-00072.bin",
+ "h.60.mlp.dense_h_to_4h.weight": "pytorch_model_00062-of-00072.bin",
+ "h.60.post_attention_layernorm.bias": "pytorch_model_00062-of-00072.bin",
+ "h.60.post_attention_layernorm.weight": "pytorch_model_00062-of-00072.bin",
+ "h.60.self_attention.dense.bias": "pytorch_model_00062-of-00072.bin",
+ "h.60.self_attention.dense.weight": "pytorch_model_00062-of-00072.bin",
+ "h.60.self_attention.query_key_value.bias": "pytorch_model_00062-of-00072.bin",
+ "h.60.self_attention.query_key_value.weight": "pytorch_model_00062-of-00072.bin",
+ "h.61.input_layernorm.bias": "pytorch_model_00063-of-00072.bin",
+ "h.61.input_layernorm.weight": "pytorch_model_00063-of-00072.bin",
+ "h.61.mlp.dense_4h_to_h.bias": "pytorch_model_00063-of-00072.bin",
+ "h.61.mlp.dense_4h_to_h.weight": "pytorch_model_00063-of-00072.bin",
+ "h.61.mlp.dense_h_to_4h.bias": "pytorch_model_00063-of-00072.bin",
+ "h.61.mlp.dense_h_to_4h.weight": "pytorch_model_00063-of-00072.bin",
+ "h.61.post_attention_layernorm.bias": "pytorch_model_00063-of-00072.bin",
+ "h.61.post_attention_layernorm.weight": "pytorch_model_00063-of-00072.bin",
+ "h.61.self_attention.dense.bias": "pytorch_model_00063-of-00072.bin",
+ "h.61.self_attention.dense.weight": "pytorch_model_00063-of-00072.bin",
+ "h.61.self_attention.query_key_value.bias": "pytorch_model_00063-of-00072.bin",
+ "h.61.self_attention.query_key_value.weight": "pytorch_model_00063-of-00072.bin",
+ "h.62.input_layernorm.bias": "pytorch_model_00064-of-00072.bin",
+ "h.62.input_layernorm.weight": "pytorch_model_00064-of-00072.bin",
+ "h.62.mlp.dense_4h_to_h.bias": "pytorch_model_00064-of-00072.bin",
+ "h.62.mlp.dense_4h_to_h.weight": "pytorch_model_00064-of-00072.bin",
+ "h.62.mlp.dense_h_to_4h.bias": "pytorch_model_00064-of-00072.bin",
+ "h.62.mlp.dense_h_to_4h.weight": "pytorch_model_00064-of-00072.bin",
+ "h.62.post_attention_layernorm.bias": "pytorch_model_00064-of-00072.bin",
+ "h.62.post_attention_layernorm.weight": "pytorch_model_00064-of-00072.bin",
+ "h.62.self_attention.dense.bias": "pytorch_model_00064-of-00072.bin",
+ "h.62.self_attention.dense.weight": "pytorch_model_00064-of-00072.bin",
+ "h.62.self_attention.query_key_value.bias": "pytorch_model_00064-of-00072.bin",
+ "h.62.self_attention.query_key_value.weight": "pytorch_model_00064-of-00072.bin",
+ "h.63.input_layernorm.bias": "pytorch_model_00065-of-00072.bin",
+ "h.63.input_layernorm.weight": "pytorch_model_00065-of-00072.bin",
+ "h.63.mlp.dense_4h_to_h.bias": "pytorch_model_00065-of-00072.bin",
+ "h.63.mlp.dense_4h_to_h.weight": "pytorch_model_00065-of-00072.bin",
+ "h.63.mlp.dense_h_to_4h.bias": "pytorch_model_00065-of-00072.bin",
+ "h.63.mlp.dense_h_to_4h.weight": "pytorch_model_00065-of-00072.bin",
+ "h.63.post_attention_layernorm.bias": "pytorch_model_00065-of-00072.bin",
+ "h.63.post_attention_layernorm.weight": "pytorch_model_00065-of-00072.bin",
+ "h.63.self_attention.dense.bias": "pytorch_model_00065-of-00072.bin",
+ "h.63.self_attention.dense.weight": "pytorch_model_00065-of-00072.bin",
+ "h.63.self_attention.query_key_value.bias": "pytorch_model_00065-of-00072.bin",
+ "h.63.self_attention.query_key_value.weight": "pytorch_model_00065-of-00072.bin",
+ "h.64.input_layernorm.bias": "pytorch_model_00066-of-00072.bin",
+ "h.64.input_layernorm.weight": "pytorch_model_00066-of-00072.bin",
+ "h.64.mlp.dense_4h_to_h.bias": "pytorch_model_00066-of-00072.bin",
+ "h.64.mlp.dense_4h_to_h.weight": "pytorch_model_00066-of-00072.bin",
+ "h.64.mlp.dense_h_to_4h.bias": "pytorch_model_00066-of-00072.bin",
+ "h.64.mlp.dense_h_to_4h.weight": "pytorch_model_00066-of-00072.bin",
+ "h.64.post_attention_layernorm.bias": "pytorch_model_00066-of-00072.bin",
+ "h.64.post_attention_layernorm.weight": "pytorch_model_00066-of-00072.bin",
+ "h.64.self_attention.dense.bias": "pytorch_model_00066-of-00072.bin",
+ "h.64.self_attention.dense.weight": "pytorch_model_00066-of-00072.bin",
+ "h.64.self_attention.query_key_value.bias": "pytorch_model_00066-of-00072.bin",
+ "h.64.self_attention.query_key_value.weight": "pytorch_model_00066-of-00072.bin",
+ "h.65.input_layernorm.bias": "pytorch_model_00067-of-00072.bin",
+ "h.65.input_layernorm.weight": "pytorch_model_00067-of-00072.bin",
+ "h.65.mlp.dense_4h_to_h.bias": "pytorch_model_00067-of-00072.bin",
+ "h.65.mlp.dense_4h_to_h.weight": "pytorch_model_00067-of-00072.bin",
+ "h.65.mlp.dense_h_to_4h.bias": "pytorch_model_00067-of-00072.bin",
+ "h.65.mlp.dense_h_to_4h.weight": "pytorch_model_00067-of-00072.bin",
+ "h.65.post_attention_layernorm.bias": "pytorch_model_00067-of-00072.bin",
+ "h.65.post_attention_layernorm.weight": "pytorch_model_00067-of-00072.bin",
+ "h.65.self_attention.dense.bias": "pytorch_model_00067-of-00072.bin",
+ "h.65.self_attention.dense.weight": "pytorch_model_00067-of-00072.bin",
+ "h.65.self_attention.query_key_value.bias": "pytorch_model_00067-of-00072.bin",
+ "h.65.self_attention.query_key_value.weight": "pytorch_model_00067-of-00072.bin",
+ "h.66.input_layernorm.bias": "pytorch_model_00068-of-00072.bin",
+ "h.66.input_layernorm.weight": "pytorch_model_00068-of-00072.bin",
+ "h.66.mlp.dense_4h_to_h.bias": "pytorch_model_00068-of-00072.bin",
+ "h.66.mlp.dense_4h_to_h.weight": "pytorch_model_00068-of-00072.bin",
+ "h.66.mlp.dense_h_to_4h.bias": "pytorch_model_00068-of-00072.bin",
+ "h.66.mlp.dense_h_to_4h.weight": "pytorch_model_00068-of-00072.bin",
+ "h.66.post_attention_layernorm.bias": "pytorch_model_00068-of-00072.bin",
+ "h.66.post_attention_layernorm.weight": "pytorch_model_00068-of-00072.bin",
+ "h.66.self_attention.dense.bias": "pytorch_model_00068-of-00072.bin",
+ "h.66.self_attention.dense.weight": "pytorch_model_00068-of-00072.bin",
+ "h.66.self_attention.query_key_value.bias": "pytorch_model_00068-of-00072.bin",
+ "h.66.self_attention.query_key_value.weight": "pytorch_model_00068-of-00072.bin",
+ "h.67.input_layernorm.bias": "pytorch_model_00069-of-00072.bin",
+ "h.67.input_layernorm.weight": "pytorch_model_00069-of-00072.bin",
+ "h.67.mlp.dense_4h_to_h.bias": "pytorch_model_00069-of-00072.bin",
+ "h.67.mlp.dense_4h_to_h.weight": "pytorch_model_00069-of-00072.bin",
+ "h.67.mlp.dense_h_to_4h.bias": "pytorch_model_00069-of-00072.bin",
+ "h.67.mlp.dense_h_to_4h.weight": "pytorch_model_00069-of-00072.bin",
+ "h.67.post_attention_layernorm.bias": "pytorch_model_00069-of-00072.bin",
+ "h.67.post_attention_layernorm.weight": "pytorch_model_00069-of-00072.bin",
+ "h.67.self_attention.dense.bias": "pytorch_model_00069-of-00072.bin",
+ "h.67.self_attention.dense.weight": "pytorch_model_00069-of-00072.bin",
+ "h.67.self_attention.query_key_value.bias": "pytorch_model_00069-of-00072.bin",
+ "h.67.self_attention.query_key_value.weight": "pytorch_model_00069-of-00072.bin",
+ "h.68.input_layernorm.bias": "pytorch_model_00070-of-00072.bin",
+ "h.68.input_layernorm.weight": "pytorch_model_00070-of-00072.bin",
+ "h.68.mlp.dense_4h_to_h.bias": "pytorch_model_00070-of-00072.bin",
+ "h.68.mlp.dense_4h_to_h.weight": "pytorch_model_00070-of-00072.bin",
+ "h.68.mlp.dense_h_to_4h.bias": "pytorch_model_00070-of-00072.bin",
+ "h.68.mlp.dense_h_to_4h.weight": "pytorch_model_00070-of-00072.bin",
+ "h.68.post_attention_layernorm.bias": "pytorch_model_00070-of-00072.bin",
+ "h.68.post_attention_layernorm.weight": "pytorch_model_00070-of-00072.bin",
+ "h.68.self_attention.dense.bias": "pytorch_model_00070-of-00072.bin",
+ "h.68.self_attention.dense.weight": "pytorch_model_00070-of-00072.bin",
+ "h.68.self_attention.query_key_value.bias": "pytorch_model_00070-of-00072.bin",
+ "h.68.self_attention.query_key_value.weight": "pytorch_model_00070-of-00072.bin",
+ "h.69.input_layernorm.bias": "pytorch_model_00071-of-00072.bin",
+ "h.69.input_layernorm.weight": "pytorch_model_00071-of-00072.bin",
+ "h.69.mlp.dense_4h_to_h.bias": "pytorch_model_00071-of-00072.bin",
+ "h.69.mlp.dense_4h_to_h.weight": "pytorch_model_00071-of-00072.bin",
+ "h.69.mlp.dense_h_to_4h.bias": "pytorch_model_00071-of-00072.bin",
+ "h.69.mlp.dense_h_to_4h.weight": "pytorch_model_00071-of-00072.bin",
+ "h.69.post_attention_layernorm.bias": "pytorch_model_00071-of-00072.bin",
+ "h.69.post_attention_layernorm.weight": "pytorch_model_00071-of-00072.bin",
+ "h.69.self_attention.dense.bias": "pytorch_model_00071-of-00072.bin",
+ "h.69.self_attention.dense.weight": "pytorch_model_00071-of-00072.bin",
+ "h.69.self_attention.query_key_value.bias": "pytorch_model_00071-of-00072.bin",
+ "h.69.self_attention.query_key_value.weight": "pytorch_model_00071-of-00072.bin",
+ "h.7.input_layernorm.bias": "pytorch_model_00009-of-00072.bin",
+ "h.7.input_layernorm.weight": "pytorch_model_00009-of-00072.bin",
+ "h.7.mlp.dense_4h_to_h.bias": "pytorch_model_00009-of-00072.bin",
+ "h.7.mlp.dense_4h_to_h.weight": "pytorch_model_00009-of-00072.bin",
+ "h.7.mlp.dense_h_to_4h.bias": "pytorch_model_00009-of-00072.bin",
+ "h.7.mlp.dense_h_to_4h.weight": "pytorch_model_00009-of-00072.bin",
+ "h.7.post_attention_layernorm.bias": "pytorch_model_00009-of-00072.bin",
+ "h.7.post_attention_layernorm.weight": "pytorch_model_00009-of-00072.bin",
+ "h.7.self_attention.dense.bias": "pytorch_model_00009-of-00072.bin",
+ "h.7.self_attention.dense.weight": "pytorch_model_00009-of-00072.bin",
+ "h.7.self_attention.query_key_value.bias": "pytorch_model_00009-of-00072.bin",
+ "h.7.self_attention.query_key_value.weight": "pytorch_model_00009-of-00072.bin",
+ "h.8.input_layernorm.bias": "pytorch_model_00010-of-00072.bin",
+ "h.8.input_layernorm.weight": "pytorch_model_00010-of-00072.bin",
+ "h.8.mlp.dense_4h_to_h.bias": "pytorch_model_00010-of-00072.bin",
+ "h.8.mlp.dense_4h_to_h.weight": "pytorch_model_00010-of-00072.bin",
+ "h.8.mlp.dense_h_to_4h.bias": "pytorch_model_00010-of-00072.bin",
+ "h.8.mlp.dense_h_to_4h.weight": "pytorch_model_00010-of-00072.bin",
+ "h.8.post_attention_layernorm.bias": "pytorch_model_00010-of-00072.bin",
+ "h.8.post_attention_layernorm.weight": "pytorch_model_00010-of-00072.bin",
+ "h.8.self_attention.dense.bias": "pytorch_model_00010-of-00072.bin",
+ "h.8.self_attention.dense.weight": "pytorch_model_00010-of-00072.bin",
+ "h.8.self_attention.query_key_value.bias": "pytorch_model_00010-of-00072.bin",
+ "h.8.self_attention.query_key_value.weight": "pytorch_model_00010-of-00072.bin",
+ "h.9.input_layernorm.bias": "pytorch_model_00011-of-00072.bin",
+ "h.9.input_layernorm.weight": "pytorch_model_00011-of-00072.bin",
+ "h.9.mlp.dense_4h_to_h.bias": "pytorch_model_00011-of-00072.bin",
+ "h.9.mlp.dense_4h_to_h.weight": "pytorch_model_00011-of-00072.bin",
+ "h.9.mlp.dense_h_to_4h.bias": "pytorch_model_00011-of-00072.bin",
+ "h.9.mlp.dense_h_to_4h.weight": "pytorch_model_00011-of-00072.bin",
+ "h.9.post_attention_layernorm.bias": "pytorch_model_00011-of-00072.bin",
+ "h.9.post_attention_layernorm.weight": "pytorch_model_00011-of-00072.bin",
+ "h.9.self_attention.dense.bias": "pytorch_model_00011-of-00072.bin",
+ "h.9.self_attention.dense.weight": "pytorch_model_00011-of-00072.bin",
+ "h.9.self_attention.query_key_value.bias": "pytorch_model_00011-of-00072.bin",
+ "h.9.self_attention.query_key_value.weight": "pytorch_model_00011-of-00072.bin",
+ "ln_f.bias": "pytorch_model_00072-of-00072.bin",
+ "ln_f.weight": "pytorch_model_00072-of-00072.bin",
+ "word_embeddings.weight": "pytorch_model_00001-of-00072.bin",
+ "word_embeddings_layernorm.bias": "pytorch_model_00001-of-00072.bin",
+ "word_embeddings_layernorm.weight": "pytorch_model_00001-of-00072.bin"
+ }
+}
diff --git a/pytorch_model_00001-of-00072.bin b/pytorch_model_00001-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ecd963607dcab9f787e6ae07fbc9ff75ee9639e9
--- /dev/null
+++ b/pytorch_model_00001-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65db970a7cbf356cfb1b5f2e9aa52dd049b5aae8af93fbb21b5b8bcaf6fb8a11
+size 7193290147
diff --git a/pytorch_model_00002-of-00072.bin b/pytorch_model_00002-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5be9137509143f8c55587293fd9c1a47edd6ca05
--- /dev/null
+++ b/pytorch_model_00002-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21f0ce45fed83e1ec25d55c029c98bcf4e80d60061f795127a9bd0921b996be4
+size 4932877601
diff --git a/pytorch_model_00003-of-00072.bin b/pytorch_model_00003-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1087e4f15999f90574989e5a8924a457ab21a46d
--- /dev/null
+++ b/pytorch_model_00003-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9fd924c1e57cf4471ee13947adba8d5a3b79c6e36816a3d17c063bc4c31540f1
+size 4932877601
diff --git a/pytorch_model_00004-of-00072.bin b/pytorch_model_00004-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6e6217b3490cc51d017fd17b2be7e5b769fc9d3e
--- /dev/null
+++ b/pytorch_model_00004-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3369ffe896a27acfa30048b700050413ec52dcd873f523be85f75e6970e4cf7c
+size 4932877601
diff --git a/pytorch_model_00005-of-00072.bin b/pytorch_model_00005-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e85ab18f6e52527d814d1fe04e32bb4f0ae49e40
--- /dev/null
+++ b/pytorch_model_00005-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a64b51c9c1bae3fc0cb701f0461103979cfe3afe1fffa287556587fc64855e76
+size 4932877601
diff --git a/pytorch_model_00006-of-00072.bin b/pytorch_model_00006-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..84ebb422ebfe5fcd187fb7c0c69b6eec7bc4198d
--- /dev/null
+++ b/pytorch_model_00006-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6df976238fa2ae2f2c2f967d24b904984eef5d742a6e3cc512ec4f7b1d48d3f7
+size 4932877601
diff --git a/pytorch_model_00007-of-00072.bin b/pytorch_model_00007-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c59e4b7de47d01f84eef29cf9cec826744f4ecf7
--- /dev/null
+++ b/pytorch_model_00007-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dc04d1489906bfaccd07d2c5660154a70f8dde7900f4fd65880544a4656df83
+size 4932877601
diff --git a/pytorch_model_00008-of-00072.bin b/pytorch_model_00008-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2a62edff36d3c153292359543ffe6d7a84e183a5
--- /dev/null
+++ b/pytorch_model_00008-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dcf4b4e6ca89eab1afe1a41440d3299845a6482bc7fcbbafd98dc4e60d7ecbe
+size 4932877601
diff --git a/pytorch_model_00009-of-00072.bin b/pytorch_model_00009-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..387cb5da600208b77b52335e69aa6ada15895441
--- /dev/null
+++ b/pytorch_model_00009-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7a1fb878ef7d6195e0bcff9254bdb24356d352548a2988efb36804727c891ba
+size 4932877601
diff --git a/pytorch_model_00010-of-00072.bin b/pytorch_model_00010-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..019fc67ddc9a88fe5ccde0738475416fa767fdba
--- /dev/null
+++ b/pytorch_model_00010-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:934fed4f1c6bae08b341889c53375ba59aca432c5d1dd013e1706a045f8336ba
+size 4932877601
diff --git a/pytorch_model_00011-of-00072.bin b/pytorch_model_00011-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..137ef213549331a85576478c94eb92f568bcf9f1
--- /dev/null
+++ b/pytorch_model_00011-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00de7ac3a3b83fe7401eab45b49fa71fc884850f69ba4c6d8372f4e362ed9da9
+size 4932877601
diff --git a/pytorch_model_00012-of-00072.bin b/pytorch_model_00012-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3bfe823d3cce0b51b1b8ed0cc6643291b0561728
--- /dev/null
+++ b/pytorch_model_00012-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4699cd92e5cab0bf1e45f7660254896e39e2812d5648e8d9335cb48e2fc3b5e0
+size 4932877665
diff --git a/pytorch_model_00013-of-00072.bin b/pytorch_model_00013-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9f1fcc477118e86ff620df04c5b82856d84e42d7
--- /dev/null
+++ b/pytorch_model_00013-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef696b624b5a43aeb953120acb3aec873582f867e94aacd4c8c5ae4d120f5464
+size 4932877665
diff --git a/pytorch_model_00014-of-00072.bin b/pytorch_model_00014-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4be6b173f08c9f384f08854b43e10dc0f42cb618
--- /dev/null
+++ b/pytorch_model_00014-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d55aae2ed5666382aea4dc3c9ea875de8b4a482d7ac11d8dd8dd9a593d6e663
+size 4932877665
diff --git a/pytorch_model_00015-of-00072.bin b/pytorch_model_00015-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c132cde3ad77462c773d3c40faa61952f325438c
--- /dev/null
+++ b/pytorch_model_00015-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50378f45b27905aead05109d7f5f99dcb33b52571d63ab66cc04fe7f41140d1a
+size 4932877665
diff --git a/pytorch_model_00016-of-00072.bin b/pytorch_model_00016-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d21c5e20f33822e0b706fd0009d35c2fe1c7734a
--- /dev/null
+++ b/pytorch_model_00016-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4a99a434d265da6753b9e0a330fe98dea879fbb45c80f41a2d95068ebb83bba
+size 4932877665
diff --git a/pytorch_model_00017-of-00072.bin b/pytorch_model_00017-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..60fe9950d0248bdccf3b28d1d074f9610dabb49a
--- /dev/null
+++ b/pytorch_model_00017-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e4b40b082de11687c3579a98429ddd4822934c9273e7c6014d75a59477d9d9e
+size 4932877665
diff --git a/pytorch_model_00018-of-00072.bin b/pytorch_model_00018-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b8f64d9f956a177990ea038a6d2d69fc6379eb28
--- /dev/null
+++ b/pytorch_model_00018-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b93d9739fff12fbe8c243e7594a1d4a83a640c7ab6cf8d409d6819e130e314c3
+size 4932877665
diff --git a/pytorch_model_00019-of-00072.bin b/pytorch_model_00019-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d859e955d350a4c1840297c2dc3111c1a62162d0
--- /dev/null
+++ b/pytorch_model_00019-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fc6e5fe0a9ce342b4e6ae8f1807a7f3b26f268a585900321032d7ff4b001022
+size 4932877665
diff --git a/pytorch_model_00020-of-00072.bin b/pytorch_model_00020-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d73aa5b9ed99da0a5ac8d7942b4a5345d1478884
--- /dev/null
+++ b/pytorch_model_00020-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ae53093d4a6fab3092db2fa968a076f2aa8d5898760b1d248b42bef77455bef
+size 4932877665
diff --git a/pytorch_model_00021-of-00072.bin b/pytorch_model_00021-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f1a41008a8cd8ddc3430c4c8227051e8e9e29a46
--- /dev/null
+++ b/pytorch_model_00021-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be8e0f20437e58c5a4d8a866563f6ae5778545be0413fad6c327c4e3c02370af
+size 4932877665
diff --git a/pytorch_model_00022-of-00072.bin b/pytorch_model_00022-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5f80ff0261be637f4e2a489d934279ccb0ffc8bb
--- /dev/null
+++ b/pytorch_model_00022-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c21d9d721b37d329cbb337e6551db810adc264bcd095217bd8b91ceb427f02f2
+size 4932877665
diff --git a/pytorch_model_00023-of-00072.bin b/pytorch_model_00023-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..06094809075daaa245d78677e42e452977c3ce79
--- /dev/null
+++ b/pytorch_model_00023-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a147cb67c9d437a6c7e34a0ef94f16153bbbd276b4038afae0644e950b9f256
+size 4932877665
diff --git a/pytorch_model_00024-of-00072.bin b/pytorch_model_00024-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2197e33b3f0e3ec7e8c1078f9e188a40ab50e28f
--- /dev/null
+++ b/pytorch_model_00024-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41c47810cf25c46dab8a84bd058d8b62aa1c4ff7e8cd76d1078870a997a34291
+size 4932877665
diff --git a/pytorch_model_00025-of-00072.bin b/pytorch_model_00025-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e83c6c195b51237520a694b8d23702160e97b777
--- /dev/null
+++ b/pytorch_model_00025-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44478168a6bfe9bf5ec911c71e085d02d7c438497db9768ab6deb22f8aaeddd9
+size 4932877665
diff --git a/pytorch_model_00026-of-00072.bin b/pytorch_model_00026-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5a78a4765df86c3d02ba907c754ac3fb57fe1170
--- /dev/null
+++ b/pytorch_model_00026-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1600f2e296599bfcd65cfe95f5abd3174c9c39f183f5b113518ac7215e0fd0ad
+size 4932877665
diff --git a/pytorch_model_00027-of-00072.bin b/pytorch_model_00027-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b8a4f417b0465267cc6af5d5f608969bc348e9c8
--- /dev/null
+++ b/pytorch_model_00027-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc2af6dc349eecc7cf1c3b241c8f907f2bfe5135f3f5717677b1c64d161ac36d
+size 4932877665
diff --git a/pytorch_model_00028-of-00072.bin b/pytorch_model_00028-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7701fc57b0e2ca0e0fc8851d9700890bff2238d2
--- /dev/null
+++ b/pytorch_model_00028-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b800e1337334ab94dcb6ba8649a4394a63be359a8a2d65f85b9c4e65104c3cc
+size 4932877665
diff --git a/pytorch_model_00029-of-00072.bin b/pytorch_model_00029-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..038e7a4d4a0bf38f51949f4c7d336ff976f0ba7c
--- /dev/null
+++ b/pytorch_model_00029-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:811dfec075f98f099055ffe062377f78cd9c2089630fa54fe257b22b55cf5762
+size 4932877665
diff --git a/pytorch_model_00030-of-00072.bin b/pytorch_model_00030-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b528f943f91c51fcc0d407ab85b33a440342fc1
--- /dev/null
+++ b/pytorch_model_00030-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99a69cd1ff317de9e306f0c71af8b9ac517f3d604b5b3ce29337398298e38bb7
+size 4932877665
diff --git a/pytorch_model_00031-of-00072.bin b/pytorch_model_00031-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7643349b5a418a568600c3d46a44515b7565425a
--- /dev/null
+++ b/pytorch_model_00031-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79561355347dd74961c26d307eae0eb23649e190e83ff92c929ba4181d03bd4e
+size 4932877665
diff --git a/pytorch_model_00032-of-00072.bin b/pytorch_model_00032-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b3783fde9b575597698dd764b2e48eb474d07fc1
--- /dev/null
+++ b/pytorch_model_00032-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5766b6e0ebd0151046b4a2f4a60d67b487cd1caf6bb57e36aa3d5f5ab3cdad1a
+size 4932877665
diff --git a/pytorch_model_00033-of-00072.bin b/pytorch_model_00033-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..78f177dd2a58a2869a78268acc04fa79d1966d9d
--- /dev/null
+++ b/pytorch_model_00033-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c46716794858776c7ba2a5a7794dd378105598a969412360f0579e80983a0a90
+size 4932877665
diff --git a/pytorch_model_00034-of-00072.bin b/pytorch_model_00034-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..23df4c51d5fd02ad0906ae33d00ed4f241e56e2d
--- /dev/null
+++ b/pytorch_model_00034-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bac35f5bd58dae341c7da44c6df74572a902f0dbe3cbdfe92e1c775ad41335c5
+size 4932877665
diff --git a/pytorch_model_00035-of-00072.bin b/pytorch_model_00035-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..de1e88db41372f180fb036b990255818df1daf38
--- /dev/null
+++ b/pytorch_model_00035-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61598ced238787c61c87e09781c2a246dc00a6b5344e170570035af885283473
+size 4932877665
diff --git a/pytorch_model_00036-of-00072.bin b/pytorch_model_00036-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9281d620d0b7eca2c20da924ba3dfe3afaf1c500
--- /dev/null
+++ b/pytorch_model_00036-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cb95a292cf711d8dc555f1496797eb3cd99cea06b8f9a7f445b822e0ffdc0aa
+size 4932877665
diff --git a/pytorch_model_00037-of-00072.bin b/pytorch_model_00037-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dae33db70cb6adeb5379f00652b9a3cad80a52a7
--- /dev/null
+++ b/pytorch_model_00037-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78dd209f2220a1dcf72b2158c86ec62c9aec6effb8cffff2cde2aaeb06b02173
+size 4932877665
diff --git a/pytorch_model_00038-of-00072.bin b/pytorch_model_00038-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..43526daf15ce30cea8a2aaeb38c6775b2d593b9a
--- /dev/null
+++ b/pytorch_model_00038-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:661fcd4bc01ad8a472f456f73452732eeb0abfcd3038f59af339fb35d49b4cfc
+size 4932877665
diff --git a/pytorch_model_00039-of-00072.bin b/pytorch_model_00039-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a0a08ffe1e34f71f7954763ef4068b4b9be81c05
--- /dev/null
+++ b/pytorch_model_00039-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39095aeccaf81bffdbcc11340703985b14e6725a51c7f2da277d158ed8a042bf
+size 4932877665
diff --git a/pytorch_model_00040-of-00072.bin b/pytorch_model_00040-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9babe6c013fbb772bcc1bc5c87664bb0f3923f72
--- /dev/null
+++ b/pytorch_model_00040-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ede312e6624b886c462757cb55c2d0f4289ae6b923679008a265cc97cebd130
+size 4932877665
diff --git a/pytorch_model_00041-of-00072.bin b/pytorch_model_00041-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6bdbfe95bba5d02c4df80f86bafcf4db4b74012d
--- /dev/null
+++ b/pytorch_model_00041-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3077475fe3a99618151a31eafc7e9f655a78682d3c3382b5faa0cc55b19d49a
+size 4932877665
diff --git a/pytorch_model_00042-of-00072.bin b/pytorch_model_00042-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..acbc5db8c0f9e7657ec7f18396c356bf7c8bd2c7
--- /dev/null
+++ b/pytorch_model_00042-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad9b3ff95974db4ae51cc9880751c2576e1ef1a01a48b48dcdcd7d618b66c1c8
+size 4932877665
diff --git a/pytorch_model_00043-of-00072.bin b/pytorch_model_00043-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bdc0a7815dbea330df707e309e7e489c718d4bd7
--- /dev/null
+++ b/pytorch_model_00043-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ed41fac6da73f98e55c1cca814a961da95cc32fd97311ca95c1963a1e8658f7
+size 4932877665
diff --git a/pytorch_model_00044-of-00072.bin b/pytorch_model_00044-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fde236368b19cb80dfddccf13c3b162a19259409
--- /dev/null
+++ b/pytorch_model_00044-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83ccaca307715c40e136a16e4018542e8f34a9939976018a76efa00254dcf698
+size 4932877665
diff --git a/pytorch_model_00045-of-00072.bin b/pytorch_model_00045-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f58ccc6a1c2bea9a993232552ae42d8913e2b070
--- /dev/null
+++ b/pytorch_model_00045-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93a852af7a92bcbf7671d5b0cdfb2e1183d087e6b4bc911d5a7c5e7982fbd5b3
+size 4932877665
diff --git a/pytorch_model_00046-of-00072.bin b/pytorch_model_00046-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1557728c187735e2109b11e5001cd50a5fe06ae7
--- /dev/null
+++ b/pytorch_model_00046-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e04df8e4f1c3b70f98bad37ab42b6a01e14b9f3c2d185e2124e2da9aa259cdf
+size 4932877665
diff --git a/pytorch_model_00047-of-00072.bin b/pytorch_model_00047-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a3fc2caee5848df435945557e044746acd66bd21
--- /dev/null
+++ b/pytorch_model_00047-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7345e00239e580e4dbdce3efbb2d980bab81778f149074a48994032a48dd12a
+size 4932877665
diff --git a/pytorch_model_00048-of-00072.bin b/pytorch_model_00048-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c3ae0a067d96a079c835fa22fa22b1d2bd05306
--- /dev/null
+++ b/pytorch_model_00048-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96a1a10ec0f2490ea0098d9335c4dd35ccde41829e229c07877d7d66fd255f77
+size 4932877665
diff --git a/pytorch_model_00049-of-00072.bin b/pytorch_model_00049-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..40cb1252eca479272a86ba722c34e10c2eda7b09
--- /dev/null
+++ b/pytorch_model_00049-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc9dd396dec4f18f9f9f35658cc650a8af51fa24436434e24e7fe8c572d850ff
+size 4932877665
diff --git a/pytorch_model_00050-of-00072.bin b/pytorch_model_00050-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e024152eb90f8817ae365a4b6538024a54d6a1c9
--- /dev/null
+++ b/pytorch_model_00050-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:673a48b108ae4fe6adc4f0dc325bf4802a7e76fafe2dfcfef81e981e2216a71e
+size 4932877665
diff --git a/pytorch_model_00051-of-00072.bin b/pytorch_model_00051-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0aea561edd42eb4c4c67caf8adef8542d5c09804
--- /dev/null
+++ b/pytorch_model_00051-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7eb56c256a955a215d8f23aa469275d9bc4c330b1ae9d5b6f7b96187b69d37be
+size 4932877665
diff --git a/pytorch_model_00052-of-00072.bin b/pytorch_model_00052-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f524ef0cd5d3a6a57e0317643f49460527f93f40
--- /dev/null
+++ b/pytorch_model_00052-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37fdd8c8dba14cb4aa94b83321ffbab3843404bf054afa5c42e72642b37f3325
+size 4932877665
diff --git a/pytorch_model_00053-of-00072.bin b/pytorch_model_00053-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5db16f2e50c4b720b2a09fc27d20322deb9c26f7
--- /dev/null
+++ b/pytorch_model_00053-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7dc5ba6b2bda3387f0e1bcbee67eadd9983320e8677b9d20c88967d7f36254ae
+size 4932877665
diff --git a/pytorch_model_00054-of-00072.bin b/pytorch_model_00054-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..29c4f1d966c1847ac6d6e946a82818d7d5e0893e
--- /dev/null
+++ b/pytorch_model_00054-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a446f09763d087e54964642a4212e0841584701a9fad709d563624fce369e4a9
+size 4932877665
diff --git a/pytorch_model_00055-of-00072.bin b/pytorch_model_00055-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d9a17b00468cf9645e82e60f0fac3e85f1a6c944
--- /dev/null
+++ b/pytorch_model_00055-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f4358881dfd444620c4211feb24b18e7b2892fefabd28e8f69aaedfa6bdaa56
+size 4932877665
diff --git a/pytorch_model_00056-of-00072.bin b/pytorch_model_00056-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5c8e607e96a04e5552cd71fc4653ead78d25451b
--- /dev/null
+++ b/pytorch_model_00056-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fff8819a56e3be153cc1cbe45ba48548b4ab40b0adced9e93efd4bd40e527f7a
+size 4932877665
diff --git a/pytorch_model_00057-of-00072.bin b/pytorch_model_00057-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..19161f053d45767d61bccf25f4d780c7d5e9d00b
--- /dev/null
+++ b/pytorch_model_00057-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f9a6e15735ca348fed8217770f09317eaec67de2449f424b957f322527da4d8
+size 4932877665
diff --git a/pytorch_model_00058-of-00072.bin b/pytorch_model_00058-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f9f2d2ecc5082bdfa3757e69e0f58d60f8a996df
--- /dev/null
+++ b/pytorch_model_00058-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:335a592871934558a3985f1278007378e9d620478f92689447c89c443789b47f
+size 4932877665
diff --git a/pytorch_model_00059-of-00072.bin b/pytorch_model_00059-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..006234583bb09bfe6b73aabb2415e573ecb08f75
--- /dev/null
+++ b/pytorch_model_00059-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee1e976ed27f92f6c69bcd8421bce4f4afb2ac1dfe2840b42b4283dd21e1d869
+size 4932877665
diff --git a/pytorch_model_00060-of-00072.bin b/pytorch_model_00060-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f58e8b4b5d06678646360294e654ba778fa42f03
--- /dev/null
+++ b/pytorch_model_00060-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26dc4a3eb31d19828a3ff7e4f3c7e4b65b5ed041629b96118ae9c351e92c9e51
+size 4932877665
diff --git a/pytorch_model_00061-of-00072.bin b/pytorch_model_00061-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..723b8278d62fea80463a6322caf0b7c9be74332b
--- /dev/null
+++ b/pytorch_model_00061-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d90c7c77a866a2bea06a42dc04ce2b646fda876d55e0602838655885d4e2ba2
+size 4932877665
diff --git a/pytorch_model_00062-of-00072.bin b/pytorch_model_00062-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..657244e473051f1b3f7321178240b2b98de166a2
--- /dev/null
+++ b/pytorch_model_00062-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d286c2b609a8cb831ea87a59fd05ad22074cb2c919f616e8f46c0b02cf637a25
+size 4932877665
diff --git a/pytorch_model_00063-of-00072.bin b/pytorch_model_00063-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4af17bd467154b1da873afe78d3350efb9b93bcf
--- /dev/null
+++ b/pytorch_model_00063-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d98a26df2951206a3b4615233d197f8a2908420928cf654800ea040eafe7ca1c
+size 4932877665
diff --git a/pytorch_model_00064-of-00072.bin b/pytorch_model_00064-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a8db680c5db5d0b948a85cec1e52e965395a4cfd
--- /dev/null
+++ b/pytorch_model_00064-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76438d2b847b03f746cf0171588b0fa91874de1582b81b9c0ab678b9dd777a7e
+size 4932877665
diff --git a/pytorch_model_00065-of-00072.bin b/pytorch_model_00065-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e9a7a0f90c22f04d1810bbe4672c043dd47a877a
--- /dev/null
+++ b/pytorch_model_00065-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0daceafac988ad9419669376ef4b6a94ac6d44e0cd35fe7a2a4c91decb4ba412
+size 4932877665
diff --git a/pytorch_model_00066-of-00072.bin b/pytorch_model_00066-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7ff56320c9a992eb77554d05ed54b2a469d94813
--- /dev/null
+++ b/pytorch_model_00066-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f03460b66795ef7a178444b9ebaf653d4db993034a29435d3c4eedac89b6c6f9
+size 4932877665
diff --git a/pytorch_model_00067-of-00072.bin b/pytorch_model_00067-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ecfc14f44105d2d523bfa6b5d4588ec9c2e867c2
--- /dev/null
+++ b/pytorch_model_00067-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec3e836923ebe41162eabaac7f1955bd3ad3eaae853ee89591f971afe9912a9c
+size 4932877665
diff --git a/pytorch_model_00068-of-00072.bin b/pytorch_model_00068-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ca4a5641bd8745e056e25538ffa1869695181780
--- /dev/null
+++ b/pytorch_model_00068-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e9fccb4f85598e19343f1d12d913a9c0b32e5572d61f793c8356144ead38654
+size 4932877665
diff --git a/pytorch_model_00069-of-00072.bin b/pytorch_model_00069-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e31ed98ed992deb6b304e7ca4bcb37b99d1bdd80
--- /dev/null
+++ b/pytorch_model_00069-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bbde52ea1b98a99e8684def8b40be7ce5a2e5c940a14f2a25547d4b217fe689
+size 4932877665
diff --git a/pytorch_model_00070-of-00072.bin b/pytorch_model_00070-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b7ea310f8384ac30044d96eae75f6b13e0554c93
--- /dev/null
+++ b/pytorch_model_00070-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93a4b0304fb2e75a5cd041e7fcc5d265e923d13adaea6aaf1559240b44c6db84
+size 4932877665
diff --git a/pytorch_model_00071-of-00072.bin b/pytorch_model_00071-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3bd48b29a4d910a57a651a584dfa914c956c247a
--- /dev/null
+++ b/pytorch_model_00071-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e13dca10345acd4b460bd5c9710b1838a5c642e093b2d1fc8119a860c49496f
+size 4932877665
diff --git a/pytorch_model_00072-of-00072.bin b/pytorch_model_00072-of-00072.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d7ec2c89a4666a961a0ad378c458e77e01a4ace2
--- /dev/null
+++ b/pytorch_model_00072-of-00072.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9fe030ef15c5368fc211aef57867cf14706e021b316fb7928e41505ec7301b0d
+size 58279
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..25bc39604f72700b3b8e10bd69bb2f227157edd1
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1 @@
+{"bos_token": "", "eos_token": "", "unk_token": "", "pad_token": ""}
\ No newline at end of file
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..370bd68e20b4b6574ee05b213a74b244e3f492f3
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fa39cd4b1500feb205bcce3b9703a4373414cafe4970e0657b413f7ddd2a9d3
+size 14500438
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..57576ae0ccd46e2a851bcfb912454077fb374c7e
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1 @@
+{"unk_token": "", "eos_token": "", "bos_token": "", "pad_token": "", "name_or_path": "bigscience/tokenizer", "special_tokens_map_file": null, "tokenizer_class":"BloomTokenizerFast", "padding_side":"left"}