adding the exact data used to train this model
Browse files
- .gitattributes +11 -0
- data/airoboros_3.2_without_contextual_slimorca_orca_sharegpt.json +3 -0
- data/allenai_wild_chat_gpt4_english_toxic_random_half_4k_sharegpt.json +3 -0
- data/capybara_sharegpt.json +3 -0
- data/cot_alpaca_gpt4_extracted_openhermes_2.5_sharegpt.json +3 -0
- data/gpt4_data_lmys_1m_sharegpt.json +3 -0
- data/gpteacher-instruct-special-alpaca.json +3 -0
- data/merged_all.json +3 -0
- data/pippa_bagel_repo_3k_sharegpt.json +3 -0
- data/remove_empty_output.py +13 -0
- data/sharegpt_gpt4_english.json +3 -0
- data/slimorca_dedup_filtered_95k_sharegpt.json +3 -0
- data/synthia-v1.3_sharegpt_12500.json +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/airoboros_3.2_without_contextual_slimorca_orca_sharegpt.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/allenai_wild_chat_gpt4_english_toxic_random_half_4k_sharegpt.json filter=lfs diff=lfs merge=lfs -text
|
38 |
+
data/capybara_sharegpt.json filter=lfs diff=lfs merge=lfs -text
|
39 |
+
data/cot_alpaca_gpt4_extracted_openhermes_2.5_sharegpt.json filter=lfs diff=lfs merge=lfs -text
|
40 |
+
data/gpt4_data_lmys_1m_sharegpt.json filter=lfs diff=lfs merge=lfs -text
|
41 |
+
data/gpteacher-instruct-special-alpaca.json filter=lfs diff=lfs merge=lfs -text
|
42 |
+
data/merged_all.json filter=lfs diff=lfs merge=lfs -text
|
43 |
+
data/pippa_bagel_repo_3k_sharegpt.json filter=lfs diff=lfs merge=lfs -text
|
44 |
+
data/sharegpt_gpt4_english.json filter=lfs diff=lfs merge=lfs -text
|
45 |
+
data/slimorca_dedup_filtered_95k_sharegpt.json filter=lfs diff=lfs merge=lfs -text
|
46 |
+
data/synthia-v1.3_sharegpt_12500.json filter=lfs diff=lfs merge=lfs -text
|
data/airoboros_3.2_without_contextual_slimorca_orca_sharegpt.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22b3140cce72bfaad2ae423c2c9bafd9ce128cf7820e8be3b9f6d415390c5689
|
3 |
+
size 89066312
|
data/allenai_wild_chat_gpt4_english_toxic_random_half_4k_sharegpt.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d82b9c5d276a699b3712b42d08b34de4ce334ab06ce185f3e55ef25a2e933852
|
3 |
+
size 41890772
|
data/capybara_sharegpt.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a1947d28999416a2f468d1e09654cfdfc9bab8ccd03aa184598d20f0000dd6e4
|
3 |
+
size 76361785
|
data/cot_alpaca_gpt4_extracted_openhermes_2.5_sharegpt.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a037af5bf62d30414b85d036c09c0f860922f66c3e7fd701abf809f7fc94c32
|
3 |
+
size 40074062
|
data/gpt4_data_lmys_1m_sharegpt.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:39a31e3af56bb53e7c723faf0d0bc8b838091bdbed0eabcd0de881f9b4f8c2a9
|
3 |
+
size 41647312
|
data/gpteacher-instruct-special-alpaca.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:598d08b29655a0da79c9c1b290431c4d22d0533231a29cb048d1056e19d95c97
|
3 |
+
size 12187144
|
data/merged_all.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:24ddc1528923e569d091d249e43866b0b3c8486fe6723a0c9431f613ec4b9f91
|
3 |
+
size 662813228
|
data/pippa_bagel_repo_3k_sharegpt.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df2121d98d19e3e1e0fc873390486df71c2f502f309eaafd5af45f3c151cfe4f
|
3 |
+
size 18361804
|
data/remove_empty_output.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Drop rows whose "output" field is an empty string from a JSON dataset."""
import json
import os


def remove_empty_output(path='merged_all.json'):
    """Rewrite the JSON list at *path* without rows whose "output" is "".

    The file is expected to contain a JSON array of objects that each carry
    an "output" key; a row without that key raises ``KeyError`` (nothing has
    been written at that point, so the file is left untouched).

    Args:
        path: JSON file to filter in place.

    Returns:
        A ``(before, after)`` tuple with the row counts prior to and
        following the filtering.
    """
    # Explicit UTF-8: JSON is UTF-8 by specification, and the platform
    # default encoding is not guaranteed to be.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    before = len(data)
    print(f"Normal len: {before}")

    data = [row for row in data if row["output"] != ""]

    after = len(data)
    print(f"After len: {after}")

    # Write to a sibling temp file first and swap it in afterwards, so a
    # failure in the middle of the dump cannot truncate the only copy of
    # the dataset.
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=1)
        os.replace(tmp_path, path)
    finally:
        # Only present here if the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return before, after


if __name__ == "__main__":
    remove_empty_output()
|
data/sharegpt_gpt4_english.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1512057e9002710757d6d8478b6678138fd878ac5844866d602b7cb7fd3e9c41
|
3 |
+
size 78552993
|
data/slimorca_dedup_filtered_95k_sharegpt.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:302e8d78b1f5f08bb7dd0ab7ded0204935003aea0b4c5bdbd8821d8924ab15f8
|
3 |
+
size 227955996
|
data/synthia-v1.3_sharegpt_12500.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dbdbc7413a3c7fc65a900518f0db8627bb5ced53e1e8ee82613d09856c1b3b70
|
3 |
+
size 30638009
|