don't XOR the JSON files

Files changed (8) hide show

README.md CHANGED Viewed

@@ -33,8 +33,6 @@ This is version 1. It has been fine-tuned using a subset of the data from Pygmal
     --decode
   ```
-**Note for Windows users:** On Windows, following the steps above may produce corrupted files. This seems to happen because `git` alters the encoding of text files (that is, the `.json` files and other relevant text files). To avoid this, use WSL. For reference, these are the MD5 hashes you should get after following the steps above:
 ```bash
 $ rhash -M *
 4608facb4910118f8dfa80f090cbc4dc  config.json
@@ -45,7 +43,7 @@ be9ba2f37228a0a9ea0eaf6530aba4de  pytorch_model-00002-of-00002.bin
 6b2e0a735969660e720c27061ef3f3d3  special_tokens_map.json
 fdb311c39b8659a5d5c1991339bafc09  tokenizer.json
 eeec4125e9c7560836b4873b6f8e3025  tokenizer.model
-f0b65b44265ba51881b1e1881102504f  tokenizer_config.json
 ```
 ## Prompting

     --decode
   ```
 ```bash
 $ rhash -M *
 4608facb4910118f8dfa80f090cbc4dc  config.json
 6b2e0a735969660e720c27061ef3f3d3  special_tokens_map.json
 fdb311c39b8659a5d5c1991339bafc09  tokenizer.json
 eeec4125e9c7560836b4873b6f8e3025  tokenizer.model
+9b3cf7b8c0e4783dbc1419b4cafe8e1e  tokenizer_config.json
 ```
 ## Prompting

xor_codec.py CHANGED Viewed

@@ -4,6 +4,7 @@ https://huggingface.co/OpenAssistant/oasst-sft-6-llama-30b-xor/raw/main/xor_code
 '''
 import os
 import sys
 import gzip
 import numpy
 from pathlib import Path
@@ -63,6 +64,12 @@ def xor_dir(dst, src_payload, src_base, decode=True, compress=True):
         xor = xor_uncompressed
     Path(dst).mkdir(parents=True, exist_ok=True)
     for path in os.listdir(src_payload):
         print("[*] Processing '%s'" % path)
         try:
             xor("%s/%s" % (dst, path), "%s/%s" % (src_payload, path), "%s/%s" % (src_base, path))

 '''
 import os
 import sys
+import shutil
 import gzip
 import numpy
 from pathlib import Path
         xor = xor_uncompressed
     Path(dst).mkdir(parents=True, exist_ok=True)
     for path in os.listdir(src_payload):
+        # Don't care about uncopyrightable text files, just copy over.
+        if ".json" in path:
+            print("[*] Copying '%s'" % path)
+            shutil.copy(f"{src_payload}/{path}", f"{dst}/{path}")
+            continue
         print("[*] Processing '%s'" % path)
         try:
             xor("%s/%s" % (dst, path), "%s/%s" % (src_payload, path), "%s/%s" % (src_base, path))

xor_encoded_files/config.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:89f8649bb9ba162ad7f95167028f4230a2b9a597d5c9e6e28a2a6e45e2b8fb70
 size 598

 version https://git-lfs.github.com/spec/v1
+oid sha256:fe29f9c0424d17fcd8f23920d53b90f2cce9663668d256542bc21ea06adaac7b
 size 598

xor_encoded_files/generation_config.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4a93894f08d98d707cd9a0274f4c9a51bcfa27e701359e12befcc78ffb488817
 size 137

 version https://git-lfs.github.com/spec/v1
+oid sha256:fd7ff399e5568cc21a0a8414f43df88ef7c424995b9b97a90563165d2cf79efd
 size 137

xor_encoded_files/pytorch_model.bin.index.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4be5669c4eb805f9afb7648438733b6ff1a3fa0d988b4165cf353929c2b89d4f
 size 26788

 version https://git-lfs.github.com/spec/v1
+oid sha256:aca3c1facc89d12311a667c32e85ac86e625990992ea8a189e3b036ba371b931
 size 26788

xor_encoded_files/special_tokens_map.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f7bcd85900e62abb00ce739eaad53d80170a4a6152d951b6825110d2fc17965
 size 411

 version https://git-lfs.github.com/spec/v1
+oid sha256:ff3b4a612c4e447acb02d40071bddd989fe0da87eb5b7fe0dbadfc4f74de7531
 size 411

xor_encoded_files/tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:721b8a5e36fc955713c7b5705281b2144e3505ee46a6194ebb39299d941094a9
 size 1842665

 version https://git-lfs.github.com/spec/v1
+oid sha256:f9ffc4aede0845ab65324ce5dccb823dca2427f9a0710981e5bc2398d73d8162
 size 1842665

xor_encoded_files/tokenizer_config.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8cf2eeac1040c1965ce9f8333c2e763b4aba5366d3b3f3367807741325304dfb
-size 831

 version https://git-lfs.github.com/spec/v1
+oid sha256:9e7aa7d0f67f207036d981d7bbabfbf4b521c4c089c0280fcc08ef9c732634b5
+size 783