don't XOR the JSON files
Browse files
- README.md +1 -3
- xor_codec.py +7 -0
- xor_encoded_files/config.json +1 -1
- xor_encoded_files/generation_config.json +1 -1
- xor_encoded_files/pytorch_model.bin.index.json +1 -1
- xor_encoded_files/special_tokens_map.json +1 -1
- xor_encoded_files/tokenizer.json +1 -1
- xor_encoded_files/tokenizer_config.json +2 -2
README.md
CHANGED
@@ -33,8 +33,6 @@ This is version 1. It has been fine-tuned using a subset of the data from Pygmal
|
|
33 |
--decode
|
34 |
```
|
35 |
|
36 |
-
**Note for Windows users:** If you're on Windows, you might run into issues where following the steps above will result in corrupted files. This seems to be because `git` messes with the encoding of text files (so the `.json`s and other relevant files). To avoid this, use WSL. For reference, these are the MD5 hashes you should get after following the steps above:
|
37 |
-
|
38 |
```bash
|
39 |
$ rhash -M *
|
40 |
4608facb4910118f8dfa80f090cbc4dc config.json
|
@@ -45,7 +43,7 @@ be9ba2f37228a0a9ea0eaf6530aba4de pytorch_model-00002-of-00002.bin
|
|
45 |
6b2e0a735969660e720c27061ef3f3d3 special_tokens_map.json
|
46 |
fdb311c39b8659a5d5c1991339bafc09 tokenizer.json
|
47 |
eeec4125e9c7560836b4873b6f8e3025 tokenizer.model
|
48 |
-
|
49 |
```
|
50 |
|
51 |
## Prompting
|
|
|
33 |
--decode
|
34 |
```
|
35 |
|
|
|
|
|
36 |
```bash
|
37 |
$ rhash -M *
|
38 |
4608facb4910118f8dfa80f090cbc4dc config.json
|
|
|
43 |
6b2e0a735969660e720c27061ef3f3d3 special_tokens_map.json
|
44 |
fdb311c39b8659a5d5c1991339bafc09 tokenizer.json
|
45 |
eeec4125e9c7560836b4873b6f8e3025 tokenizer.model
|
46 |
+
9b3cf7b8c0e4783dbc1419b4cafe8e1e tokenizer_config.json
|
47 |
```
|
48 |
|
49 |
## Prompting
|
xor_codec.py
CHANGED
@@ -4,6 +4,7 @@ https://huggingface.co/OpenAssistant/oasst-sft-6-llama-30b-xor/raw/main/xor_code
|
|
4 |
'''
|
5 |
import os
|
6 |
import sys
|
|
|
7 |
import gzip
|
8 |
import numpy
|
9 |
from pathlib import Path
|
@@ -63,6 +64,12 @@ def xor_dir(dst, src_payload, src_base, decode=True, compress=True):
|
|
63 |
xor = xor_uncompressed
|
64 |
Path(dst).mkdir(parents=True, exist_ok=True)
|
65 |
for path in os.listdir(src_payload):
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
print("[*] Processing '%s'" % path)
|
67 |
try:
|
68 |
xor("%s/%s" % (dst, path), "%s/%s" % (src_payload, path), "%s/%s" % (src_base, path))
|
|
|
4 |
'''
|
5 |
import os
|
6 |
import sys
|
7 |
+
import shutil
|
8 |
import gzip
|
9 |
import numpy
|
10 |
from pathlib import Path
|
|
|
64 |
xor = xor_uncompressed
|
65 |
Path(dst).mkdir(parents=True, exist_ok=True)
|
66 |
for path in os.listdir(src_payload):
|
67 |
+
# Don't care about uncopyrightable text files, just copy over.
|
68 |
+
if ".json" in path:
|
69 |
+
print("[*] Copying '%s'" % path)
|
70 |
+
shutil.copy(f"{src_payload}/{path}", f"{dst}/{path}")
|
71 |
+
continue
|
72 |
+
|
73 |
print("[*] Processing '%s'" % path)
|
74 |
try:
|
75 |
xor("%s/%s" % (dst, path), "%s/%s" % (src_payload, path), "%s/%s" % (src_base, path))
|
xor_encoded_files/config.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 598
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe29f9c0424d17fcd8f23920d53b90f2cce9663668d256542bc21ea06adaac7b
|
3 |
size 598
|
xor_encoded_files/generation_config.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 137
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd7ff399e5568cc21a0a8414f43df88ef7c424995b9b97a90563165d2cf79efd
|
3 |
size 137
|
xor_encoded_files/pytorch_model.bin.index.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 26788
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aca3c1facc89d12311a667c32e85ac86e625990992ea8a189e3b036ba371b931
|
3 |
size 26788
|
xor_encoded_files/special_tokens_map.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 411
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff3b4a612c4e447acb02d40071bddd989fe0da87eb5b7fe0dbadfc4f74de7531
|
3 |
size 411
|
xor_encoded_files/tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1842665
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f9ffc4aede0845ab65324ce5dccb823dca2427f9a0710981e5bc2398d73d8162
|
3 |
size 1842665
|
xor_encoded_files/tokenizer_config.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e7aa7d0f67f207036d981d7bbabfbf4b521c4c089c0280fcc08ef9c732634b5
|
3 |
+
size 783
|