Yaning1001 committed on
Commit
94011a1
1 Parent(s): 5d21464

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +19 -0
  2. README.md +90 -0
  3. __pycache__/qwen_utils.cpython-39.pyc +0 -0
  4. __pycache__/utils.cpython-39.pyc +0 -0
  5. __pycache__/utils_llama.cpython-311.pyc +0 -0
  6. __pycache__/utils_llama.cpython-39.pyc +0 -0
  7. __pycache__/utils_qwen.cpython-311.pyc +0 -0
  8. __pycache__/utils_qwen.cpython-39.pyc +0 -0
  9. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/bnc_spoken.train +3 -0
  10. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/gutenberg.train +3 -0
  11. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/open_subtitles.train +3 -0
  12. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/bnc_spoken_affected.test +3 -0
  13. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/childes_affected.test +3 -0
  14. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/gutenberg_affected.test +3 -0
  15. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/open_subtitles_affected.test +3 -0
  16. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/simple_wiki_affected.test +3 -0
  17. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/switchboard_affected.test +3 -0
  18. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test +3 -0
  19. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/childes_unaffected_sents.test +3 -0
  20. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test +3 -0
  21. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test +3 -0
  22. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test +3 -0
  23. data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/switchboard_unaffected_sents.test +3 -0
  24. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test +0 -0
  25. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test +0 -0
  26. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test +0 -0
  27. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_unaffected_sents/switchboard_unaffected_sents.test +0 -0
  28. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/bnc_spoken.train +3 -0
  29. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/childes.train +3 -0
  30. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/gutenberg.train +3 -0
  31. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/open_subtitles.train +3 -0
  32. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/simple_wiki.train +3 -0
  33. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/switchboard.train +3 -0
  34. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/bnc_spoken.dev +3 -0
  35. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/childes.dev +3 -0
  36. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/gutenberg.dev +3 -0
  37. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/open_subtitles.dev +3 -0
  38. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/simple_wiki.dev +3 -0
  39. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/switchboard.dev +3 -0
  40. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/bnc_spoken_affected.test +3 -0
  41. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/childes_affected.test +3 -0
  42. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/gutenberg_affected.test +3 -0
  43. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/open_subtitles_affected.test +3 -0
  44. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/simple_wiki_affected.test +3 -0
  45. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/switchboard_affected.test +3 -0
  46. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected/childes_unaffected.test +0 -0
  47. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test +0 -0
  48. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected_sents/childes_unaffected_sents.test +0 -0
  49. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test +0 -0
  50. data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test +0 -0
.gitattributes CHANGED
@@ -90,3 +90,22 @@ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-
90
  train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1050/tokenizer.json filter=lfs diff=lfs merge=lfs -text
91
  train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-2080/tokenizer.json filter=lfs diff=lfs merge=lfs -text
92
  train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/artifacts/models--meta-llama--Llama-3.2-3B/snapshots/5cc0ffe09ee49f7be6ca7c794ee6bd7245e84e60/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1050/tokenizer.json filter=lfs diff=lfs merge=lfs -text
91
  train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-2080/tokenizer.json filter=lfs diff=lfs merge=lfs -text
92
  train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/artifacts/models--meta-llama--Llama-3.2-3B/snapshots/5cc0ffe09ee49f7be6ca7c794ee6bd7245e84e60/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
93
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-600/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
94
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1050/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
95
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1800/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
96
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-2080/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
97
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1350/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
98
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1200/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
99
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1200/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
100
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1050/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
101
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1950/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
102
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1650/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
103
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
104
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1950/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
105
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-300/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
106
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-450/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
107
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-2080/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
108
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-300/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
109
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1500/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
110
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1500/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
111
+ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-750/model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 💥 Mission: Impossible Language Models 💥
2
+
3
+ This is the code repository for the paper "[Mission: Impossible Language Models](https://arxiv.org/abs/2401.06416)".
4
+
5
+ If you use our code, please cite our paper:
6
+
7
+ ```
8
+ @misc{kallini2024mission,
9
+ title={Mission: Impossible Language Models},
10
+ author={Julie Kallini and Isabel Papadimitriou and Richard Futrell and Kyle Mahowald and Christopher Potts},
11
+ year={2024},
12
+ eprint={2401.06416},
13
+ archivePrefix={arXiv},
14
+ primaryClass={cs.CL}
15
+ }
16
+ ```
17
+
18
+ This repository contains the code necessary to fully replicate our paper, including the scripts to create impossible language datasets, train GPT-2 models, and run all experiments. We also include the notebooks to generate the result graphs in our paper.
19
+
20
+ Let's get started!
21
+
22
+ ## Setup
23
+
24
+ First, clone the repo and install dependencies:
25
+
26
+ ```
27
+ git clone https://github.com/jkallini/mission-impossible-language-models.git
28
+ cd mission-impossible-language-models
29
+ pip install -r requirements.txt
30
+ ```
31
+
32
+ ## Impossible Language Dataset Creation
33
+
34
+ The scripts for creating impossible language datasets are located in the `data/` directory.
35
+ First, you must download a copy of the [BabyLM dataset](https://babylm.github.io/), which we use for our experiments.
36
+ Then, make sure to set `BABYLM_DATA_PATH` in the `utils.py` file to the path on your system where your BabyLM dataset is located.
37
+
38
+ After downloading the BabyLM dataset, you will need to tag it with morphological features and part-of-speech tags. You can use our `tag.py` script.
39
+
40
+ With the tagged data, you can easily recreate one of the impossible language datasets described in our paper. These are predefined and listed in the `PERTURBATIONS` section at the end of `utils.py`. Here is an
41
+ example for the PartialReverse language from the paper:
42
+
43
+ ```
44
+ python3 perturb.py reverse_partial 100M
45
+ ```
46
+
47
+ This will create a perturbed version of the 100M BabyLM train set. You may use `perturb.py` or `perturb.sh` to perturb multiple splits at the same time.
48
+
49
+ ### Defining New Impossible Languages
50
+
51
+ You can also define your own impossible languages! They are described by four attributes:
52
+
53
+ 1. `perturbation_function`: function mapping tagged sentences to sequences of GPT-2 tokens.
54
+ 2. `affect_function`: function that determines whether an input sentence is "affected" or altered by the perturbation.
55
+ 3. `filter_function`: function that determines whether an input sentence should be included in the final dataset.
56
+ 4. `gpt2_tokenizer`: tokenizer used to perturb this dataset.
57
+
58
+ You can add these definitions to `utils.py`, where the existing perturbations are located.
59
+
60
+
61
+ ## Model Training
62
+
63
+ To train GPT-2 models, we use [`mistral`](https://github.com/stanford-crfm/mistral). If you would like to train GPT-2s with `mistral` as well, please follow their steps for installation. You may download their repo anywhere on your system.
64
+
65
+ Next, make sure to change the following constants in `utils.py`:
66
+ - `CHECKPOINT_WRITE_PATH`: the path where your training checkpoints will be written.
67
+ - `CHECKPOINT_READ_PATH`: the path where you will read training checkpoints when running experiments.
68
+
69
+ Our training scripts are in the `training/` directory.
70
+ Once you have `mistral` installed, set `MISTRAL_PATH` to the path of your library in `prepare_training.sh`. Then, you can use this script to generate the config files that you will use to launch `mistral` training runs.
71
+
72
+ Our scripts will create the config files and move them to the location of your `mistral` directory—you will only need to launch the training run. Here's an example command to launch training for the PartialReverse language using the 100M training set with the random seed set to 41:
73
+
74
+ ```
75
+ CUDA_VISIBLE_DEVICES=0 python3 train.py --config conf/train_reverse_partial_100M_randinit_seed41.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.warmup_steps 300 --training_arguments.max_steps 3000
76
+ ```
77
+
78
+ ## Experiments
79
+
80
+ The main paper includes three experiments: perplexities, surprisals, and causal interventions. The appendix also includes a constituency probing experiment.
81
+
82
+ The scripts to run each of these experiments are separated into their own subdirectories:
83
+
84
+ 1. `perplexities/`: code to run perplexity experiments. You may use `perplexities.py` or `perplexities.sh` to run experiments for multiple languages at the same time.
85
+ 2. `hop_surprisal/`: code to run surprisal experiments for the *Hop languages, in `hop_surprisal.py`.
86
+ 3. `hop_interventions/`: code to run interchange intervention experiments for the *Hop languages. First generate the agreement data using `create_agreement_data.py`, then run the intervention experiments using `hop_interventions.py`.
87
+ You will need to separately clone and install [`align-transformers`](https://github.com/frankaging/align-transformers) (recently renamed to `pyvene`) and set `PATH_TO_ALIGN_TRANSFORMERS` to the path where the library is located on your system.
88
+ 4. `edge_probing/`: code to run constituency probing experiments. Use `get_constituency_parses.py` and `load_phrase_data.py` to prepare the test data, and use `edge_probing.py` to run the experiments.
89
+
90
+ Each directory contains Python notebooks to generate the result graphs shown in the paper.
__pycache__/qwen_utils.cpython-39.pyc ADDED
Binary file (11.6 kB). View file
 
__pycache__/utils.cpython-39.pyc ADDED
Binary file (11.6 kB). View file
 
__pycache__/utils_llama.cpython-311.pyc ADDED
Binary file (20.9 kB). View file
 
__pycache__/utils_llama.cpython-39.pyc ADDED
Binary file (11.6 kB). View file
 
__pycache__/utils_qwen.cpython-311.pyc ADDED
Binary file (20.8 kB). View file
 
__pycache__/utils_qwen.cpython-39.pyc ADDED
Binary file (11.6 kB). View file
 
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/bnc_spoken.train ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4bcf7adfa462513a7a1aa470c096b80cf1a43c0c97196212ccdf3c08ba7a364
3
+ size 5008360
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/gutenberg.train ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:338ae2ca0ce7595795b35b7a27e8f34a52b1cea91273efc15c475ead9e64959a
3
+ size 16343495
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/open_subtitles.train ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90e3f7e89c21800473567ee8d5661b1915c7d734ea89a50f9120a60bd454836c
3
+ size 13128159
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/bnc_spoken_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f022a598dbfbd131a35246022dffd4c57d859481075c79bc6499652423b88e91
3
+ size 2619317
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/childes_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:822bee21207097081f912992215bf810c67581f5109b4641acb6cf2790e7dc60
3
+ size 5072279
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/gutenberg_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28cd50fdd46369061f945d681a6fdd2b0dd66dda303d941eea9d61f86eaff12f
3
+ size 3752642
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/open_subtitles_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ff3a890f0e407efed4f9e36b61b13620718cdea0312d456713e17fbc244e6cf
3
+ size 3189043
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/simple_wiki_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2083e1c23595655afcf9822336ec1dd3da759bdf67b92f5d047598b96474387a
3
+ size 4581812
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_affected/switchboard_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25c440d7ea89fa1611f35e487fb3e31c009fb0b3ef1cd39d514abcf4c8cf52d2
3
+ size 529118
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6a52471e389f1b9e0ee8e9e81d6f07024a9d0e2bef64b3360bd3eea57b9af0d
3
+ size 1936645
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/childes_unaffected_sents.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e32ba7d98ae9ebefe7f25ba8c14f5ef9c4ea2e18423062253953b7a229f30689
3
+ size 9069635
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ff490ad471e7a632f9cce0a79216fab698424b37322073f1d8f9193cec67d61
3
+ size 9908252
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a36df78972cda2a89ce70d2b159b073c9c2cf114e173d9623df3ba1704c0ed62
3
+ size 6614054
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a6656ee5ed666a5879baab086ce842f7162195959330aee09327081b7f54946
3
+ size 3774618
data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected_sents/switchboard_unaffected_sents.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:499ba9c7b67d50543f5c7b6977cc7c18ac5da7ef0cc28e9cbeb1816d45038f40
3
+ size 337196
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test ADDED
File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test ADDED
File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test ADDED
File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_unaffected_sents/switchboard_unaffected_sents.test ADDED
File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/bnc_spoken.train ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beca8a56cf5829e143edeb2298c3b3f4b64ed002166c4bcff3402adc4b68ffcf
3
+ size 5509282
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/childes.train ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33cb4488ac5b029fb0988b204321ce59aaf822d5b4110bed26d2d8b848729e01
3
+ size 24490419
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/gutenberg.train ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84adc9961d617d8fea49fa87021aa0a10f49fce21b1d9524462058ad0ef7285e
3
+ size 16364206
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/open_subtitles.train ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e3f9107ff70578785fc0ea9f7020a0e1edd5fe95c8b8643ed033a413ac42cef
3
+ size 14492108
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/simple_wiki.train ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae57fb5df45a4e74fca26340b1ceacece32c278a11b2c10a434d7271965687ef
3
+ size 10199712
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_10M/switchboard.train ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06967e038d68da7fc9ee53ceca256856b8cc840c281f6e04a9202c5efb3212ee
3
+ size 961566
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/bnc_spoken.dev ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1030d66bbf3da860dd6115db93596fc63616dee33a8a581ca6201af0d4a0d95d
3
+ size 7706841
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/childes.dev ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffc14d3bcafa3aaeeb006f16fd9ff9cdaae695b596b10c4bbb030a7534d429d5
3
+ size 23046068
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/gutenberg.dev ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df608f4977a2311a48fae5c98c919f966e30d6ddae879f6079f5ef150d16bccc
3
+ size 17909917
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/open_subtitles.dev ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1474009adcc0127568191a012f4725e5ef5d68b7c1f0fb749247dbf0dd58071
3
+ size 15254169
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/simple_wiki.dev ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aacd50c10eced582fdc489070a96c46c1be371bbabbb8c6c7fb4f660c08a3016
3
+ size 9832550
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_dev/switchboard.dev ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74f777e7b31c6599001351db2f41de78ebeeba6780d68e282101781dac93f4f7
3
+ size 988060
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/bnc_spoken_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9697b5970ad4e0b636226cda7ea4a1c0061d6d007027e17084df795a7dfc96fb
3
+ size 5546587
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/childes_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:555ac120d23e4f7bc6305011796046e95bd8372945368e42f41ee30a54573c6a
3
+ size 22923367
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/gutenberg_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea7c30f238df670316c799f675fd182f60669a1820c55c84801c635fe4f3541e
3
+ size 15240278
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/open_subtitles_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff06e0e5b1a66c0f4865f825526a44ec39d01d52e55b3dea9a3fb9a984f0c540
3
+ size 13869412
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/simple_wiki_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f26542c86d483485ac8e3bd03ad3b8860627673cf5e317affe22f040e9d03a33
3
+ size 9142615
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local3/babylm_test_affected/switchboard_affected.test ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01489603107dd8e69ecc3e6a5e76b25f74d5e890b6a9ac3068e8d879f408dbb7
3
+ size 1087539
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected/childes_unaffected.test ADDED
File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test ADDED
File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected_sents/childes_unaffected_sents.test ADDED
File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test ADDED
File without changes
data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_nondeterministic/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test ADDED
File without changes