TroyDoesAI committed
Commit
eebf9af
1 Parent(s): 71a529c

1 - Epoch of BlackSheep Persona with Training Logs

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/run-zd7c97g6.wandb filter=lfs diff=lfs merge=lfs -text
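
The added rule routes the binary wandb run file through Git LFS, matching the existing *.zip, *.zst, and *tfevents* rules above it. A minimal sketch to confirm the attributes apply in a local clone (assumes git is on PATH; the path is the one this commit adds):

# Check that the new path resolves to the LFS filter/diff/merge drivers.
import subprocess

path = "Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/run-zd7c97g6.wandb"
result = subprocess.run(
    ["git", "check-attr", "filter", "diff", "merge", "--", path],
    capture_output=True, text=True, check=True,
)
print(result.stdout)  # each attribute should report "lfs" for this path
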
Training_BlackSheep_MoE/Epoch_1/Epoch_1-BlackSheep-with-Personas.png ADDED
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/files/conda-environment.yaml ADDED
@@ -0,0 +1,380 @@
+ name: C:\Users\Administrator\Desktop\text-generation-webui-main\installer_files\env
+ channels:
+ - defaults
+ dependencies:
+ - bzip2=1.0.8=h2bbff1b_6
+ - ca-certificates=2024.7.2=haa95532_0
+ - git=2.45.2=haa95532_0
+ - libffi=3.4.4=hd77b12b_1
+ - ninja-base=1.10.2=h6d14046_5
+ - openssl=3.0.14=h827c3e9_0
+ - pip=24.0=py311haa95532_0
+ - python=3.11.9=he1021f5_0
+ - setuptools=69.5.1=py311haa95532_0
+ - sqlite=3.45.3=h2bbff1b_0
+ - tk=8.6.14=h0416ee5_0
+ - vc=14.2=h2eaa2aa_4
+ - vs2015_runtime=14.29.30133=h43f2093_4
+ - wheel=0.43.0=py311haa95532_0
+ - xz=5.4.6=h8cc25b3_1
+ - zlib=1.2.13=h8cc25b3_1
+ - pip:
+ - absl-py==2.1.0
+ - accelerate==0.33.0
+ - aiofiles==23.2.1
+ - aiohappyeyeballs==2.4.3
+ - aiohttp==3.10.8
+ - aiosignal==1.3.1
+ - alembic==1.13.2
+ - altair==5.3.0
+ - annotated-types==0.7.0
+ - anthropic==0.36.0
+ - anyio==4.4.0
+ - apscheduler==3.10.4
+ - argon2-cffi==23.1.0
+ - argon2-cffi-bindings==21.2.0
+ - asgiref==3.8.1
+ - attrs==23.2.0
+ - authlib==1.3.2
+ - auto-gptq==0.7.1
+ - autoawq==0.2.6
+ - autoawq-kernels==0.0.7
+ - av==12.3.0
+ - backoff==2.2.1
+ - bcrypt==4.2.0
+ - beautifulsoup4==4.12.3
+ - bidict==0.23.1
+ - bitarray==2.9.3
+ - bitsandbytes==0.43.3
+ - black==24.8.0
+ - blinker==1.8.2
+ - boto3==1.35.0
+ - botocore==1.35.39
+ - build==1.2.2.post1
+ - cachetools==5.5.0
+ - certifi==2024.7.4
+ - cffi==1.17.1
+ - chardet==5.2.0
+ - charset-normalizer==3.3.2
+ - chroma-hnswlib==0.7.6
+ - chromadb==0.5.9
+ - click==8.1.7
+ - colbert-ai==0.2.21
+ - colorama==0.4.6
+ - colorclass==2.2.2
+ - coloredlogs==15.0.1
+ - compressed-rtf==1.0.6
+ - comtypes==1.4.7
+ - contourpy==1.2.1
+ - cramjam==2.8.3
+ - cryptography==43.0.1
+ - ctranslate2==4.4.0
+ - cycler==0.12.1
+ - dataclasses-json==0.6.7
+ - dataproperty==1.0.1
+ - datasets==2.20.0
+ - defusedxml==0.7.1
+ - deprecated==1.2.14
+ - dill==0.3.8
+ - diskcache==5.6.3
+ - distro==1.9.0
+ - dnspython==2.6.1
+ - docker==7.1.0
+ - docker-pycreds==0.4.0
+ - docx2txt==0.8
+ - duckduckgo-search==6.2.13
+ - durationpy==0.9
+ - easygui==0.98.3
+ - ebcdic==1.1.1
+ - ecdsa==0.19.0
+ - einops==0.8.0
+ - elevenlabs==1.9.0
+ - email-validator==2.2.0
+ - emoji==2.14.0
+ - environs==9.5.0
+ - et-xmlfile==1.1.0
+ - eval-type-backport==0.2.0
+ - exllamav2==0.1.8+cu121.torch2.2.2
+ - extract-msg==0.51.1
+ - fake-useragent==1.5.1
+ - fastapi==0.111.0
+ - fastapi-cli==0.0.4
+ - faster-whisper==1.0.3
+ - fastparquet==2024.5.0
+ - ffmpy==0.3.2
+ - filelock==3.13.1
+ - filetype==1.2.0
+ - flash-attn==2.6.1
+ - flask==3.0.3
+ - flask-cloudflared==0.0.14
+ - flask-cors==5.0.0
+ - flask-sqlalchemy==3.1.1
+ - flatbuffers==24.3.25
+ - fonttools==4.53.1
+ - fpdf2==2.7.9
+ - frozenlist==1.4.1
+ - fsspec==2024.2.0
+ - ftfy==6.2.3
+ - gekko==1.2.1
+ - gguf==0.9.1
+ - git-python==1.0.3
+ - gitdb==4.0.11
+ - gitpython==3.1.43
+ - google-ai-generativelanguage==0.6.6
+ - google-api-core==2.21.0
+ - google-api-python-client==2.149.0
+ - google-auth==2.35.0
+ - google-auth-httplib2==0.2.0
+ - google-generativeai==0.7.2
+ - googleapis-common-protos==1.65.0
+ - gradio==4.26.0
+ - gradio-client==0.15.1
+ - greenlet==3.1.1
+ - grpcio==1.65.1
+ - grpcio-status==1.62.3
+ - h11==0.14.0
+ - halo==0.0.31
+ - hqq==0.1.7.post3
+ - httpcore==1.0.5
+ - httplib2==0.22.0
+ - httptools==0.6.1
+ - httpx==0.27.0
+ - huggingface-hub==0.24.0
+ - humanfriendly==10.0
+ - idna==3.7
+ - importlib-metadata==8.4.0
+ - importlib-resources==6.4.0
+ - iniconfig==2.0.0
+ - itsdangerous==2.2.0
+ - jinja2==3.1.4
+ - jiter==0.6.1
+ - jmespath==1.0.1
+ - joblib==1.4.2
+ - jsonl2json==1.0.0
+ - jsonlines==4.0.0
+ - jsonpatch==1.33
+ - jsonpath-python==1.0.6
+ - jsonpointer==3.0.0
+ - jsonschema==4.23.0
+ - jsonschema-specifications==2023.12.1
+ - keyboard==0.13.5
+ - kiwisolver==1.4.5
+ - kubernetes==31.0.0
+ - langchain==0.2.15
+ - langchain-chroma==0.1.4
+ - langchain-community==0.2.12
+ - langchain-core==0.2.41
+ - langchain-text-splitters==0.2.4
+ - langdetect==1.0.9
+ - langfuse==2.44.0
+ - langsmith==0.1.134
+ - lark==1.1.9
+ - llama-cpp-python==0.2.89+cpuavx2
+ - llama-cpp-python-cuda==0.2.89+cu121
+ - llama-cpp-python-cuda-tensorcores==0.2.89+cu121
+ - llvmlite==0.42.0
+ - lm-eval==0.3.0
+ - log-symbols==0.0.14
+ - lxml==5.3.0
+ - mako==1.3.5
+ - markdown==3.7
+ - markdown-it-py==3.0.0
+ - markupsafe==2.1.5
+ - marshmallow==3.22.0
+ - matplotlib==3.9.1
+ - mbstrdecoder==1.1.3
+ - mdurl==0.1.2
+ - mmh3==5.0.1
+ - monotonic==1.6
+ - mpmath==1.3.0
+ - msoffcrypto-tool==5.4.2
+ - multidict==6.0.5
+ - multiprocess==0.70.16
+ - mypy-extensions==1.0.0
+ - nest-asyncio==1.6.0
+ - networkx==3.2.1
+ - ninja==1.11.1.1
+ - nltk==3.9.1
+ - numba==0.59.1
+ - numexpr==2.10.1
+ - numpy==1.26.4
+ - oauthlib==3.2.2
+ - olefile==0.47
+ - oletools==0.60.2
+ - onnxruntime==1.19.2
+ - open-webui==0.3.32
+ - openai==1.37.0
+ - opencv-python==4.10.0.84
+ - opencv-python-headless==4.10.0.84
+ - openpyxl==3.1.5
+ - opentelemetry-api==1.27.0
+ - opentelemetry-exporter-otlp-proto-common==1.27.0
+ - opentelemetry-exporter-otlp-proto-grpc==1.27.0
+ - opentelemetry-instrumentation==0.48b0
+ - opentelemetry-instrumentation-asgi==0.48b0
+ - opentelemetry-instrumentation-fastapi==0.48b0
+ - opentelemetry-proto==1.27.0
+ - opentelemetry-sdk==1.27.0
+ - opentelemetry-semantic-conventions==0.48b0
+ - opentelemetry-util-http==0.48b0
+ - optimum==1.17.1
+ - orjson==3.10.6
+ - overrides==7.7.0
+ - packaging==23.2
+ - pandas==2.2.3
+ - passlib==1.7.4
+ - pathspec==0.12.1
+ - pathvalidate==3.2.0
+ - pcodedmp==1.2.6
+ - peewee==3.17.6
+ - peewee-migrate==1.12.2
+ - peft==0.12.0
+ - pillow==10.4.0
+ - platformdirs==4.2.2
+ - pluggy==1.5.0
+ - portalocker==2.10.1
+ - posthog==3.7.0
+ - primp==0.6.3
+ - propcache==0.2.0
+ - proto-plus==1.24.0
+ - protobuf==4.25.3
+ - psutil==6.0.0
+ - psycopg2-binary==2.9.9
+ - py-cpuinfo==9.0.0
+ - pyarrow==17.0.0
+ - pyarrow-hotfix==0.6
+ - pyasn1==0.6.1
+ - pyasn1-modules==0.4.1
+ - pyaudio==0.2.14
+ - pybind11==2.13.1
+ - pycaw==20240210
+ - pyclipper==1.3.0.post5
+ - pycountry==24.6.1
+ - pycparser==2.22
+ - pydantic==2.9.2
+ - pydantic-core==2.23.4
+ - pydub==0.25.1
+ - pygame==2.6.1
+ - pygments==2.18.0
+ - pyjwt==2.9.0
+ - pymilvus==2.4.7
+ - pymongo==4.10.1
+ - pymysql==1.1.1
+ - pypandoc==1.13
+ - pyparsing==3.1.2
+ - pypdf==4.3.1
+ - pypika==0.48.9
+ - pyproject-hooks==1.2.0
+ - pyqt5==5.15.11
+ - pyqt5-qt5==5.15.2
+ - pyqt5-sip==12.15.0
+ - pyreadline3==3.4.1
+ - pytablewriter==1.2.0
+ - pytest==8.3.3
+ - pytest-docker==3.1.1
+ - python-dateutil==2.8.2
+ - python-dotenv==1.0.1
+ - python-engineio==4.9.1
+ - python-iso639==2024.4.27
+ - python-jose==3.3.0
+ - python-magic==0.4.27
+ - python-multipart==0.0.9
+ - python-oxmsg==0.0.1
+ - python-pptx==1.0.0
+ - python-socketio==5.11.3
+ - pytube==15.0.0
+ - pytz==2024.1
+ - pywin32==306
+ - pyxlsb==1.0.10
+ - pyyaml==6.0.1
+ - rank-bm25==0.2.2
+ - rapidfuzz==3.10.0
+ - rapidocr-onnxruntime==1.3.24
+ - red-black-tree-mod==1.20
+ - redis==5.1.1
+ - referencing==0.35.1
+ - regex==2024.5.15
+ - requests==2.32.3
+ - requests-oauthlib==2.0.0
+ - requests-toolbelt==1.0.0
+ - rich==13.7.1
+ - rouge==1.0.1
+ - rouge-score==0.1.2
+ - rpds-py==0.19.0
+ - rsa==4.9
+ - rtfde==0.1.2
+ - ruff==0.5.4
+ - s3transfer==0.10.3
+ - sacrebleu==1.5.0
+ - safetensors==0.4.3
+ - scikit-learn==1.5.1
+ - scipy==1.14.0
+ - semantic-version==2.10.0
+ - sentence-transformers==3.0.1
+ - sentencepiece==0.2.0
+ - sentry-sdk==2.10.0
+ - setproctitle==1.3.3
+ - shapely==2.0.6
+ - shellingham==1.5.4
+ - simple-websocket==1.1.0
+ - six==1.16.0
+ - smmap==5.0.1
+ - sniffio==1.3.1
+ - soupsieve==2.6
+ - speechrecognition==3.10.0
+ - spinners==0.0.24
+ - sqlalchemy==2.0.32
+ - sqlitedict==2.1.0
+ - sse-starlette==1.6.5
+ - starlette==0.37.2
+ - sympy==1.12
+ - tabledata==1.3.3
+ - tabulate==0.9.0
+ - tcolorpy==0.1.6
+ - tenacity==8.5.0
+ - tensorboard==2.17.0
+ - tensorboard-data-server==0.7.2
+ - termcolor==2.4.0
+ - threadpoolctl==3.5.0
+ - tiktoken==0.7.0
+ - timm==1.0.8
+ - tokenizers==0.19.1
+ - tomlkit==0.12.0
+ - toolz==0.12.1
+ - torch==2.2.2+cu121
+ - torchaudio==2.2.2+cu121
+ - torchvision==0.17.2+cu121
+ - tqdm==4.66.4
+ - tqdm-multiprocess==0.0.11
+ - transformers==4.44.2
+ - typepy==1.3.2
+ - typer==0.12.3
+ - typing-extensions==4.9.0
+ - typing-inspect==0.9.0
+ - tzdata==2024.1
+ - tzlocal==5.2
+ - ujson==5.10.0
+ - unstructured==0.15.9
+ - unstructured-client==0.26.1
+ - uritemplate==4.1.1
+ - urllib3==2.2.2
+ - uvicorn==0.30.6
+ - validators==0.33.0
+ - wandb==0.17.5
+ - watchfiles==0.22.0
+ - wcwidth==0.2.13
+ - webrtcvad==2.0.10
+ - websocket-client==1.8.0
+ - websockets==11.0.3
+ - werkzeug==3.0.3
+ - win-unicode-console==0.5
+ - wrapt==1.16.0
+ - wsproto==1.2.0
+ - xlrd==2.0.1
+ - xlsxwriter==3.2.0
+ - xxhash==3.4.1
+ - yarl==1.15.1
+ - youtube-transcript-api==0.6.2
+ - zipp==3.20.2
+ - zstandard==0.23.0
+ prefix: C:\Users\Administrator\Desktop\text-generation-webui-main\installer_files\env
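
The environment above pins the full training stack for this epoch, notably torch 2.2.2+cu121, transformers 4.44.2, peft 0.12.0, bitsandbytes 0.43.3, accelerate 0.33.0, and wandb 0.17.5 on Python 3.11.9. A small sketch, assuming the environment has been recreated, that checks the key pins (the selection below is illustrative, not exhaustive):

# Compare installed versions against the pins in conda-environment.yaml.
from importlib.metadata import version

pins = {
    "torch": "2.2.2+cu121",
    "transformers": "4.44.2",
    "peft": "0.12.0",
    "bitsandbytes": "0.43.3",
    "accelerate": "0.33.0",
    "wandb": "0.17.5",
}
for pkg, expected in pins.items():
    installed = version(pkg)
    status = "ok" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{pkg}=={installed}: {status}")
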
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/files/wandb-metadata.json ADDED
@@ -0,0 +1,53 @@
+ {
+ "os": "Windows-10-10.0.22621-SP0",
+ "python": "3.11.9",
+ "heartbeatAt": "2024-10-17T19:57:13.102016",
+ "startedAt": "2024-10-17T19:57:13.021163",
+ "docker": null,
+ "cuda": null,
+ "args": [],
+ "state": "running",
+ "program": "C:\\Users\\Administrator\\Desktop\\text-generation-webui-main\\server.py",
+ "codePathLocal": "server.py",
+ "codePath": "server.py",
+ "git": {
+ "remote": "https://github.com/oobabooga/text-generation-webui",
+ "commit": "5522584992c632d75d2389e9342793fd1dcc391d"
+ },
+ "email": "41653822+Troys-Code@users.noreply.github.com",
+ "root": "C:/Users/Administrator/Desktop/text-generation-webui-main",
+ "host": "Troy-New-PC",
+ "username": "Troy",
+ "executable": "C:\\Users\\Administrator\\Desktop\\text-generation-webui-main\\installer_files\\env\\python.exe",
+ "cpu_count": 12,
+ "cpu_count_logical": 20,
+ "cpu_freq": {
+ "current": 3600.0,
+ "min": 0.0,
+ "max": 3600.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 3600.0,
+ "min": 0.0,
+ "max": 3600.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 930.6826133728027,
+ "used": 716.3763427734375
+ }
+ },
+ "gpu": "NVIDIA GeForce RTX 3090",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA GeForce RTX 3090",
+ "memory_total": 25769803776
+ }
+ ],
+ "memory": {
+ "total": 31.863067626953125
+ }
+ }
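
The metadata records the training host: Windows 10, Python 3.11.9, a single NVIDIA GeForce RTX 3090 (24 GiB VRAM), 12 physical / 20 logical cores, and roughly 32 GB of system RAM. A short sketch that summarizes the file, assuming it sits at the relative path this commit adds:

# Print a hardware summary from wandb-metadata.json.
import json

with open("Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/files/wandb-metadata.json") as f:
    meta = json.load(f)

gpu = meta["gpu_devices"][0]
print(meta["os"], "| Python", meta["python"])
print(f"{gpu['name']}: {gpu['memory_total'] / 2**30:.0f} GiB VRAM x {meta['gpu_count']}")
print(f"{meta['cpu_count']} cores / {meta['cpu_count_logical']} threads, "
      f"{meta['memory']['total']:.1f} GB RAM")
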
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/logs/debug.log ADDED
@@ -0,0 +1,28 @@
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Current SDK version is 0.17.5
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Configure stats pid to 32648
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Loading settings from C:\Users\Administrator\.config\wandb\settings
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Loading settings from C:\Users\Administrator\Desktop\text-generation-webui-main\wandb\settings
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'mode': 'offline'}
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'server.py', 'program_abspath': 'C:\\Users\\Administrator\\Desktop\\text-generation-webui-main\\server.py', 'program': 'C:\\Users\\Administrator\\Desktop\\text-generation-webui-main\\server.py'}
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:_log_setup():529] Logging user logs to C:\Users\Administrator\Desktop\text-generation-webui-main\wandb\offline-run-20241017_125713-zd7c97g6\logs\debug.log
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:_log_setup():530] Logging internal logs to C:\Users\Administrator\Desktop\text-generation-webui-main\wandb\offline-run-20241017_125713-zd7c97g6\logs\debug-internal.log
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():569] calling init triggers
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():576] wandb.init called with sweep_config: {}
+ config: {}
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():619] starting backend
+ 2024-10-17 12:57:13,023 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():623] setting up manager
+ 2024-10-17 12:57:13,024 INFO Thread-7 (threaded_run):32648 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=spawn, using: spawn
+ 2024-10-17 12:57:13,025 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():631] backend started and connected
+ 2024-10-17 12:57:13,028 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():720] updated telemetry
+ 2024-10-17 12:57:13,052 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-10-17 12:57:13,054 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():804] starting run threads in backend
+ 2024-10-17 12:57:16,864 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_console_start():2413] atexit reg
+ 2024-10-17 12:57:16,864 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_redirect():2255] redirect: wrap_raw
+ 2024-10-17 12:57:16,864 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_redirect():2320] Wrapping output streams.
+ 2024-10-17 12:57:16,864 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_redirect():2345] Redirects installed.
+ 2024-10-17 12:57:16,865 INFO Thread-7 (threaded_run):32648 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-10-17 12:57:16,867 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_config_callback():1382] config_cb None None {'peft_config': {'default': {'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'models\\TroyDoesAI_ContextObedient-MoE', 'revision': None, 'task_type': 'CAUSAL_LM', 'inference_mode': False, 'r': 32, 'target_modules': {'q_proj', 'v_proj'}, 'lora_alpha': 64, 'lora_dropout': 0, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}}}, 'vocab_size': 32064, 'max_position_embeddings': 16384, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'sliding_window': None, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 10000.0, 'attention_dropout': 0.0, 'num_experts_per_tok': 2, 'num_local_experts': 3, 'output_router_logits': False, 'router_aux_loss_coef': 0.001, 'router_jitter_noise': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MixtralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 32000, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'models\\TroyDoesAI_ContextObedient-MoE', 'transformers_version': '4.44.2', 'attention_bias': False, 'mlp_bias': False, 'model_type': 'mixtral', 'pretraining_tp': 1, 'rope_scaling': None, 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': True, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'loras/BlackSheep-Lora', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 6, 'max_steps': -1, 'lr_scheduler_type': 'polynomial', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'loras/BlackSheep-Lora\\runs\\Oct17_12-57-11_Troy-New-PC', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'no', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'loras/BlackSheep-Lora', 'disable_tqdm': True, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_bnb_8bit', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'no', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False}
+ 2024-10-17 12:57:16,870 INFO Thread-7 (threaded_run):32648 [wandb_config.py:__setitem__():151] config set model/num_parameters = 8665795584 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x00000214C0853850>>
+ 2024-10-17 12:57:16,870 INFO Thread-7 (threaded_run):32648 [wandb_run.py:_config_callback():1382] config_cb model/num_parameters 8665795584 None
+ 2024-10-17 18:00:28,321 WARNING MsgRouterThr:32648 [router.py:message_loop():77] message_loop has been closed
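
Line 25 of the log captures the whole run configuration: a rank-32 LoRA (alpha 64, dropout 0) on the q_proj and v_proj modules of the 4-bit NF4-quantized Mixtral-style MoE models\TroyDoesAI_ContextObedient-MoE (3 local experts, 2 active per token, 8,665,795,584 parameters), trained at a 2e-05 learning rate on a polynomial schedule for 6 epochs with batch size 4, bf16, and the 8-bit AdamW optimizer. Below is a hedged reconstruction of just the adapter and trainer settings from those logged values; this is a sketch, not the exact code path text-generation-webui executed:

# Adapter + trainer config rebuilt from the config_cb log line above.
from peft import LoraConfig
from transformers import TrainingArguments

lora_config = LoraConfig(
    r=32,                                 # 'r': 32
    lora_alpha=64,                        # 'lora_alpha': 64
    lora_dropout=0.0,                     # 'lora_dropout': 0
    target_modules=["q_proj", "v_proj"],  # 'target_modules': {'q_proj', 'v_proj'}
    bias="none",
    task_type="CAUSAL_LM",
)

training_args = TrainingArguments(
    output_dir="loras/BlackSheep-Lora",
    per_device_train_batch_size=4,
    learning_rate=2e-05,
    num_train_epochs=6,
    lr_scheduler_type="polynomial",  # power defaults to 1.0, i.e. linear decay
    optim="adamw_bnb_8bit",
    bf16=True,
    logging_steps=5,
    save_strategy="no",
    report_to=["wandb"],
    seed=42,
)
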
Training_BlackSheep_MoE/Epoch_1/Epoch_1_WandDB/run-zd7c97g6.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eaa1f304de6ed78411f4c9f14e9eedf8f4193726cc33705b574db125f45a5d3f
+ size 1038295
Training_BlackSheep_MoE/Epoch_1/MIXTRAL-training_log_Epoch1-BlackSheep-with-Personas.txt ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Step: 4 {'loss': 1.8393, 'grad_norm': 0.9135997295379639, 'learning_rate': 1.9988154761904764e-05, 'epoch': 0.0035714285714285713}
2
+ Step: 9 {'loss': 1.6701, 'grad_norm': 0.7541273832321167, 'learning_rate': 1.9976309523809527e-05, 'epoch': 0.007142857142857143}
3
+ Step: 14 {'loss': 1.2554, 'grad_norm': 0.6005926132202148, 'learning_rate': 1.9964464285714286e-05, 'epoch': 0.010714285714285714}
4
+ Step: 19 {'loss': 1.4734, 'grad_norm': 0.6956166625022888, 'learning_rate': 1.995261904761905e-05, 'epoch': 0.014285714285714285}
5
+ Step: 24 {'loss': 1.2668, 'grad_norm': 0.3963625133037567, 'learning_rate': 1.994077380952381e-05, 'epoch': 0.017857142857142856}
6
+ Step: 29 {'loss': 1.2668, 'grad_norm': 0.41620415449142456, 'learning_rate': 1.9928928571428574e-05, 'epoch': 0.02142857142857143}
7
+ Step: 34 {'loss': 1.4369, 'grad_norm': 0.7620450854301453, 'learning_rate': 1.9917083333333336e-05, 'epoch': 0.025}
8
+ Step: 39 {'loss': 1.3254, 'grad_norm': 0.6728664636611938, 'learning_rate': 1.99052380952381e-05, 'epoch': 0.02857142857142857}
9
+ Step: 44 {'loss': 1.0877, 'grad_norm': 0.5350437760353088, 'learning_rate': 1.9893392857142858e-05, 'epoch': 0.03214285714285714}
10
+ Step: 49 {'loss': 1.0392, 'grad_norm': 0.6517331600189209, 'learning_rate': 1.988154761904762e-05, 'epoch': 0.03571428571428571}
11
+ Step: 54 {'loss': 1.0966, 'grad_norm': 0.4985944628715515, 'learning_rate': 1.9869702380952383e-05, 'epoch': 0.039285714285714285}
12
+ Step: 59 {'loss': 1.0616, 'grad_norm': 0.6425554752349854, 'learning_rate': 1.9857857142857145e-05, 'epoch': 0.04285714285714286}
13
+ Step: 64 {'loss': 1.3173, 'grad_norm': 0.9711938500404358, 'learning_rate': 1.9846011904761908e-05, 'epoch': 0.04642857142857143}
14
+ Step: 69 {'loss': 0.965, 'grad_norm': 0.5132351517677307, 'learning_rate': 1.9834166666666667e-05, 'epoch': 0.05}
15
+ Step: 74 {'loss': 0.8959, 'grad_norm': 0.5035068988800049, 'learning_rate': 1.982232142857143e-05, 'epoch': 0.05357142857142857}
16
+ Step: 79 {'loss': 0.8977, 'grad_norm': 0.6544917821884155, 'learning_rate': 1.9810476190476192e-05, 'epoch': 0.05714285714285714}
17
+ Step: 84 {'loss': 0.8336, 'grad_norm': 0.7621152997016907, 'learning_rate': 1.9798630952380955e-05, 'epoch': 0.060714285714285714}
18
+ Step: 89 {'loss': 0.8617, 'grad_norm': 0.5246341228485107, 'learning_rate': 1.9786785714285717e-05, 'epoch': 0.06428571428571428}
19
+ Step: 94 {'loss': 0.7953, 'grad_norm': 0.5174001455307007, 'learning_rate': 1.977494047619048e-05, 'epoch': 0.06785714285714285}
20
+ Step: 99 {'loss': 1.0036, 'grad_norm': 0.8036244511604309, 'learning_rate': 1.976309523809524e-05, 'epoch': 0.07142857142857142}
21
+ Step: 104 {'loss': 0.9666, 'grad_norm': 0.8807173371315002, 'learning_rate': 1.975125e-05, 'epoch': 0.075}
22
+ Step: 109 {'loss': 0.7682, 'grad_norm': 0.6022303700447083, 'learning_rate': 1.9739404761904764e-05, 'epoch': 0.07857142857142857}
23
+ Step: 114 {'loss': 0.8332, 'grad_norm': 0.5524723529815674, 'learning_rate': 1.9727559523809527e-05, 'epoch': 0.08214285714285714}
24
+ Step: 119 {'loss': 0.6926, 'grad_norm': 0.6818602085113525, 'learning_rate': 1.971571428571429e-05, 'epoch': 0.08571428571428572}
25
+ Step: 124 {'loss': 0.7233, 'grad_norm': 0.5903816819190979, 'learning_rate': 1.9703869047619052e-05, 'epoch': 0.08928571428571429}
26
+ Step: 129 {'loss': 0.6642, 'grad_norm': 0.7943810820579529, 'learning_rate': 1.969202380952381e-05, 'epoch': 0.09285714285714286}
27
+ Step: 134 {'loss': 0.6425, 'grad_norm': 0.5778236985206604, 'learning_rate': 1.9680178571428574e-05, 'epoch': 0.09642857142857143}
28
+ Step: 139 {'loss': 0.6691, 'grad_norm': 0.5044384002685547, 'learning_rate': 1.9668333333333333e-05, 'epoch': 0.1}
29
+ Step: 144 {'loss': 0.6386, 'grad_norm': 0.6634503602981567, 'learning_rate': 1.9656488095238095e-05, 'epoch': 0.10357142857142858}
30
+ Step: 149 {'loss': 0.748, 'grad_norm': 0.653299868106842, 'learning_rate': 1.9644642857142858e-05, 'epoch': 0.10714285714285714}
31
+ Step: 154 {'loss': 0.6776, 'grad_norm': 0.6532788276672363, 'learning_rate': 1.963279761904762e-05, 'epoch': 0.11071428571428571}
32
+ Step: 159 {'loss': 0.4942, 'grad_norm': 0.6517465114593506, 'learning_rate': 1.9620952380952383e-05, 'epoch': 0.11428571428571428}
33
+ Step: 164 {'loss': 0.5849, 'grad_norm': 0.7105296850204468, 'learning_rate': 1.9609107142857142e-05, 'epoch': 0.11785714285714285}
34
+ Step: 169 {'loss': 0.5823, 'grad_norm': 0.8051909804344177, 'learning_rate': 1.9597261904761905e-05, 'epoch': 0.12142857142857143}
35
+ Step: 174 {'loss': 0.6124, 'grad_norm': 0.901710569858551, 'learning_rate': 1.9585416666666667e-05, 'epoch': 0.125}
36
+ Step: 179 {'loss': 0.7032, 'grad_norm': 0.8329763412475586, 'learning_rate': 1.957357142857143e-05, 'epoch': 0.12857142857142856}
37
+ Step: 184 {'loss': 0.5445, 'grad_norm': 0.41105031967163086, 'learning_rate': 1.9561726190476192e-05, 'epoch': 0.13214285714285715}
38
+ Step: 189 {'loss': 0.6622, 'grad_norm': 0.9969037175178528, 'learning_rate': 1.9549880952380955e-05, 'epoch': 0.1357142857142857}
39
+ Step: 194 {'loss': 0.6171, 'grad_norm': 0.6689130067825317, 'learning_rate': 1.9538035714285714e-05, 'epoch': 0.1392857142857143}
40
+ Step: 199 {'loss': 0.5791, 'grad_norm': 0.9522849917411804, 'learning_rate': 1.9526190476190477e-05, 'epoch': 0.14285714285714285}
41
+ Step: 204 {'loss': 0.7005, 'grad_norm': 0.6476870179176331, 'learning_rate': 1.951434523809524e-05, 'epoch': 0.14642857142857144}
42
+ Step: 209 {'loss': 0.548, 'grad_norm': 0.6938184499740601, 'learning_rate': 1.9502500000000002e-05, 'epoch': 0.15}
43
+ Step: 214 {'loss': 0.5984, 'grad_norm': 0.7758791446685791, 'learning_rate': 1.9490654761904764e-05, 'epoch': 0.15357142857142858}
44
+ Step: 219 {'loss': 0.5668, 'grad_norm': 0.6298092007637024, 'learning_rate': 1.9478809523809523e-05, 'epoch': 0.15714285714285714}
45
+ Step: 224 {'loss': 0.4565, 'grad_norm': 0.8767203688621521, 'learning_rate': 1.9466964285714286e-05, 'epoch': 0.16071428571428573}
46
+ Step: 229 {'loss': 0.4918, 'grad_norm': 0.8556920886039734, 'learning_rate': 1.945511904761905e-05, 'epoch': 0.16428571428571428}
47
+ Step: 234 {'loss': 0.6559, 'grad_norm': 0.9740686416625977, 'learning_rate': 1.944327380952381e-05, 'epoch': 0.16785714285714284}
48
+ Step: 239 {'loss': 0.5291, 'grad_norm': 1.2440892457962036, 'learning_rate': 1.9431428571428574e-05, 'epoch': 0.17142857142857143}
49
+ Step: 244 {'loss': 0.4741, 'grad_norm': 0.8751192092895508, 'learning_rate': 1.9419583333333336e-05, 'epoch': 0.175}
50
+ Step: 249 {'loss': 0.4616, 'grad_norm': 0.6261155009269714, 'learning_rate': 1.9407738095238095e-05, 'epoch': 0.17857142857142858}
51
+ Step: 254 {'loss': 0.6365, 'grad_norm': 0.7985801696777344, 'learning_rate': 1.9395892857142858e-05, 'epoch': 0.18214285714285713}
52
+ Step: 259 {'loss': 0.4893, 'grad_norm': 0.8446434140205383, 'learning_rate': 1.938404761904762e-05, 'epoch': 0.18571428571428572}
53
+ Step: 264 {'loss': 0.4726, 'grad_norm': 0.998410701751709, 'learning_rate': 1.9372202380952383e-05, 'epoch': 0.18928571428571428}
54
+ Step: 269 {'loss': 0.4389, 'grad_norm': 0.7421302199363708, 'learning_rate': 1.9360357142857146e-05, 'epoch': 0.19285714285714287}
55
+ Step: 274 {'loss': 0.3953, 'grad_norm': 0.7805348038673401, 'learning_rate': 1.9348511904761905e-05, 'epoch': 0.19642857142857142}
56
+ Step: 279 {'loss': 0.5141, 'grad_norm': 1.2035925388336182, 'learning_rate': 1.9336666666666667e-05, 'epoch': 0.2}
57
+ Step: 284 {'loss': 0.5336, 'grad_norm': 0.8996376395225525, 'learning_rate': 1.932482142857143e-05, 'epoch': 0.20357142857142857}
58
+ Step: 289 {'loss': 0.5258, 'grad_norm': 0.6805949211120605, 'learning_rate': 1.9312976190476192e-05, 'epoch': 0.20714285714285716}
59
+ Step: 294 {'loss': 0.4903, 'grad_norm': 0.7106399536132812, 'learning_rate': 1.9301130952380955e-05, 'epoch': 0.21071428571428572}
60
+ Step: 299 {'loss': 0.3474, 'grad_norm': 0.5261926054954529, 'learning_rate': 1.9289285714285718e-05, 'epoch': 0.21428571428571427}
61
+ Step: 304 {'loss': 0.5147, 'grad_norm': 0.8087174892425537, 'learning_rate': 1.9277440476190477e-05, 'epoch': 0.21785714285714286}
62
+ Step: 309 {'loss': 0.387, 'grad_norm': 0.6345266699790955, 'learning_rate': 1.926559523809524e-05, 'epoch': 0.22142857142857142}
63
+ Step: 314 {'loss': 0.5001, 'grad_norm': 0.6739416122436523, 'learning_rate': 1.9253750000000002e-05, 'epoch': 0.225}
64
+ Step: 319 {'loss': 0.4823, 'grad_norm': 0.9729430079460144, 'learning_rate': 1.9241904761904764e-05, 'epoch': 0.22857142857142856}
65
+ Step: 324 {'loss': 0.4074, 'grad_norm': 0.8870615363121033, 'learning_rate': 1.9230059523809527e-05, 'epoch': 0.23214285714285715}
66
+ Step: 329 {'loss': 0.404, 'grad_norm': 0.6122156381607056, 'learning_rate': 1.921821428571429e-05, 'epoch': 0.2357142857142857}
67
+ Step: 334 {'loss': 0.399, 'grad_norm': 0.7009484767913818, 'learning_rate': 1.920636904761905e-05, 'epoch': 0.2392857142857143}
68
+ Step: 339 {'loss': 0.4015, 'grad_norm': 0.9186666011810303, 'learning_rate': 1.919452380952381e-05, 'epoch': 0.24285714285714285}
69
+ Step: 344 {'loss': 0.3585, 'grad_norm': 0.7455053925514221, 'learning_rate': 1.9182678571428574e-05, 'epoch': 0.24642857142857144}
70
+ Step: 349 {'loss': 0.3825, 'grad_norm': 0.6361353993415833, 'learning_rate': 1.9170833333333336e-05, 'epoch': 0.25}
71
+ Step: 354 {'loss': 0.4294, 'grad_norm': 0.8647685050964355, 'learning_rate': 1.91589880952381e-05, 'epoch': 0.25357142857142856}
72
+ Step: 359 {'loss': 0.3671, 'grad_norm': 0.5729554295539856, 'learning_rate': 1.9147142857142858e-05, 'epoch': 0.2571428571428571}
73
+ Step: 364 {'loss': 0.3255, 'grad_norm': 0.5804448127746582, 'learning_rate': 1.913529761904762e-05, 'epoch': 0.26071428571428573}
74
+ Step: 369 {'loss': 0.513, 'grad_norm': 0.8690835237503052, 'learning_rate': 1.9123452380952383e-05, 'epoch': 0.2642857142857143}
75
+ Step: 374 {'loss': 0.3854, 'grad_norm': 1.04226553440094, 'learning_rate': 1.9111607142857146e-05, 'epoch': 0.26785714285714285}
76
+ Step: 379 {'loss': 0.3508, 'grad_norm': 0.9402156472206116, 'learning_rate': 1.9099761904761908e-05, 'epoch': 0.2714285714285714}
77
+ Step: 384 {'loss': 0.4982, 'grad_norm': 0.6728529930114746, 'learning_rate': 1.908791666666667e-05, 'epoch': 0.275}
78
+ Step: 389 {'loss': 0.3801, 'grad_norm': 0.6414267420768738, 'learning_rate': 1.907607142857143e-05, 'epoch': 0.2785714285714286}
79
+ Step: 394 {'loss': 0.4152, 'grad_norm': 1.0602443218231201, 'learning_rate': 1.9064226190476192e-05, 'epoch': 0.28214285714285714}
80
+ Step: 399 {'loss': 0.4706, 'grad_norm': 0.5564787983894348, 'learning_rate': 1.905238095238095e-05, 'epoch': 0.2857142857142857}
81
+ Step: 404 {'loss': 0.3742, 'grad_norm': 0.8708383440971375, 'learning_rate': 1.9040535714285714e-05, 'epoch': 0.2892857142857143}
82
+ Step: 409 {'loss': 0.4291, 'grad_norm': 1.4418401718139648, 'learning_rate': 1.9028690476190477e-05, 'epoch': 0.29285714285714287}
83
+ Step: 414 {'loss': 0.4047, 'grad_norm': 0.877226710319519, 'learning_rate': 1.901684523809524e-05, 'epoch': 0.29642857142857143}
84
+ Step: 419 {'loss': 0.5602, 'grad_norm': 0.9882636666297913, 'learning_rate': 1.9005000000000002e-05, 'epoch': 0.3}
85
+ Step: 424 {'loss': 0.4152, 'grad_norm': 0.6507172584533691, 'learning_rate': 1.899315476190476e-05, 'epoch': 0.30357142857142855}
86
+ Step: 429 {'loss': 0.385, 'grad_norm': 0.8154886960983276, 'learning_rate': 1.8981309523809524e-05, 'epoch': 0.30714285714285716}
87
+ Step: 434 {'loss': 0.4448, 'grad_norm': 1.0586435794830322, 'learning_rate': 1.8969464285714286e-05, 'epoch': 0.3107142857142857}
88
+ Step: 439 {'loss': 0.4122, 'grad_norm': 0.9430441856384277, 'learning_rate': 1.895761904761905e-05, 'epoch': 0.3142857142857143}
89
+ Step: 444 {'loss': 0.3908, 'grad_norm': 0.5545459985733032, 'learning_rate': 1.894577380952381e-05, 'epoch': 0.31785714285714284}
90
+ Step: 449 {'loss': 0.4621, 'grad_norm': 0.9478644132614136, 'learning_rate': 1.8933928571428574e-05, 'epoch': 0.32142857142857145}
91
+ Step: 454 {'loss': 0.3571, 'grad_norm': 0.5178350210189819, 'learning_rate': 1.8922083333333333e-05, 'epoch': 0.325}
92
+ Step: 459 {'loss': 0.283, 'grad_norm': 0.544847846031189, 'learning_rate': 1.8910238095238095e-05, 'epoch': 0.32857142857142857}
93
+ Step: 464 {'loss': 0.3541, 'grad_norm': 0.736697256565094, 'learning_rate': 1.8898392857142858e-05, 'epoch': 0.33214285714285713}
94
+ Step: 469 {'loss': 0.4792, 'grad_norm': 1.094739556312561, 'learning_rate': 1.888654761904762e-05, 'epoch': 0.3357142857142857}
95
+ Step: 474 {'loss': 0.4067, 'grad_norm': 0.8329822421073914, 'learning_rate': 1.8874702380952383e-05, 'epoch': 0.3392857142857143}
96
+ Step: 479 {'loss': 0.4462, 'grad_norm': 0.7292098999023438, 'learning_rate': 1.8862857142857142e-05, 'epoch': 0.34285714285714286}
97
+ Step: 484 {'loss': 0.4539, 'grad_norm': 0.8009581565856934, 'learning_rate': 1.8851011904761905e-05, 'epoch': 0.3464285714285714}
98
+ Step: 489 {'loss': 0.4553, 'grad_norm': 1.1885050535202026, 'learning_rate': 1.8839166666666667e-05, 'epoch': 0.35}
99
+ Step: 494 {'loss': 0.2783, 'grad_norm': 0.5375037789344788, 'learning_rate': 1.882732142857143e-05, 'epoch': 0.3535714285714286}
100
+ Step: 499 {'loss': 0.3999, 'grad_norm': 0.8919097185134888, 'learning_rate': 1.8815476190476193e-05, 'epoch': 0.35714285714285715}
101
+ Step: 504 {'loss': 0.3989, 'grad_norm': 0.8732523918151855, 'learning_rate': 1.8803630952380955e-05, 'epoch': 0.3607142857142857}
102
+ Step: 509 {'loss': 0.4154, 'grad_norm': 0.5995797514915466, 'learning_rate': 1.8791785714285714e-05, 'epoch': 0.36428571428571427}
103
+ Step: 514 {'loss': 0.3292, 'grad_norm': 0.6189977526664734, 'learning_rate': 1.8779940476190477e-05, 'epoch': 0.3678571428571429}
104
+ Step: 519 {'loss': 0.405, 'grad_norm': 0.8150209188461304, 'learning_rate': 1.876809523809524e-05, 'epoch': 0.37142857142857144}
105
+ Step: 524 {'loss': 0.3163, 'grad_norm': 0.9416306018829346, 'learning_rate': 1.8756250000000002e-05, 'epoch': 0.375}
106
+ Step: 529 {'loss': 0.4091, 'grad_norm': 1.3431493043899536, 'learning_rate': 1.8744404761904764e-05, 'epoch': 0.37857142857142856}
107
+ Step: 534 {'loss': 0.3866, 'grad_norm': 0.6864269375801086, 'learning_rate': 1.8732559523809527e-05, 'epoch': 0.3821428571428571}
108
+ Step: 539 {'loss': 0.4974, 'grad_norm': 1.5816361904144287, 'learning_rate': 1.8720714285714286e-05, 'epoch': 0.38571428571428573}
109
+ Step: 544 {'loss': 0.3429, 'grad_norm': 0.9576981663703918, 'learning_rate': 1.870886904761905e-05, 'epoch': 0.3892857142857143}
110
+ Step: 549 {'loss': 0.5506, 'grad_norm': 0.8996490836143494, 'learning_rate': 1.869702380952381e-05, 'epoch': 0.39285714285714285}
111
+ Step: 554 {'loss': 0.4465, 'grad_norm': 0.6192946434020996, 'learning_rate': 1.8685178571428574e-05, 'epoch': 0.3964285714285714}
112
+ Step: 559 {'loss': 0.5349, 'grad_norm': 0.8320725560188293, 'learning_rate': 1.8673333333333336e-05, 'epoch': 0.4}
113
+ Step: 564 {'loss': 0.4332, 'grad_norm': 0.984227180480957, 'learning_rate': 1.8661488095238096e-05, 'epoch': 0.4035714285714286}
114
+ Step: 569 {'loss': 0.3875, 'grad_norm': 0.9194024205207825, 'learning_rate': 1.8649642857142858e-05, 'epoch': 0.40714285714285714}
115
+ Step: 574 {'loss': 0.3601, 'grad_norm': 0.7953531742095947, 'learning_rate': 1.863779761904762e-05, 'epoch': 0.4107142857142857}
116
+ Step: 579 {'loss': 0.3632, 'grad_norm': 0.7107942700386047, 'learning_rate': 1.8625952380952383e-05, 'epoch': 0.4142857142857143}
117
+ Step: 584 {'loss': 0.3376, 'grad_norm': 1.1161280870437622, 'learning_rate': 1.8614107142857146e-05, 'epoch': 0.41785714285714287}
118
+ Step: 589 {'loss': 0.3145, 'grad_norm': 0.7818060517311096, 'learning_rate': 1.8602261904761908e-05, 'epoch': 0.42142857142857143}
119
+ Step: 594 {'loss': 0.3659, 'grad_norm': 0.9914860129356384, 'learning_rate': 1.8590416666666667e-05, 'epoch': 0.425}
120
+ Step: 599 {'loss': 0.4318, 'grad_norm': 1.2383373975753784, 'learning_rate': 1.857857142857143e-05, 'epoch': 0.42857142857142855}
121
+ Step: 604 {'loss': 0.3811, 'grad_norm': 1.2427709102630615, 'learning_rate': 1.8566726190476193e-05, 'epoch': 0.43214285714285716}
122
+ Step: 609 {'loss': 0.4007, 'grad_norm': 0.795452356338501, 'learning_rate': 1.8554880952380955e-05, 'epoch': 0.4357142857142857}
123
+ Step: 614 {'loss': 0.4641, 'grad_norm': 0.7359730005264282, 'learning_rate': 1.8543035714285718e-05, 'epoch': 0.4392857142857143}
124
+ Step: 619 {'loss': 0.3813, 'grad_norm': 1.0183659791946411, 'learning_rate': 1.8531190476190477e-05, 'epoch': 0.44285714285714284}
125
+ Step: 624 {'loss': 0.3183, 'grad_norm': 0.6068124175071716, 'learning_rate': 1.851934523809524e-05, 'epoch': 0.44642857142857145}
126
+ Step: 629 {'loss': 0.3476, 'grad_norm': 0.8716106414794922, 'learning_rate': 1.8507500000000002e-05, 'epoch': 0.45}
127
+ Step: 634 {'loss': 0.3398, 'grad_norm': 0.8179718852043152, 'learning_rate': 1.8495654761904765e-05, 'epoch': 0.45357142857142857}
128
+ Step: 639 {'loss': 0.3957, 'grad_norm': 0.8983686566352844, 'learning_rate': 1.8483809523809527e-05, 'epoch': 0.45714285714285713}
129
+ Step: 644 {'loss': 0.4374, 'grad_norm': 1.2546746730804443, 'learning_rate': 1.847196428571429e-05, 'epoch': 0.4607142857142857}
130
+ Step: 649 {'loss': 0.4142, 'grad_norm': 0.9628292322158813, 'learning_rate': 1.846011904761905e-05, 'epoch': 0.4642857142857143}
131
+ Step: 654 {'loss': 0.371, 'grad_norm': 1.2992581129074097, 'learning_rate': 1.844827380952381e-05, 'epoch': 0.46785714285714286}
132
+ Step: 659 {'loss': 0.3134, 'grad_norm': 0.9638750553131104, 'learning_rate': 1.8436428571428574e-05, 'epoch': 0.4714285714285714}
133
+ Step: 664 {'loss': 0.2918, 'grad_norm': 0.7682401537895203, 'learning_rate': 1.8424583333333336e-05, 'epoch': 0.475}
134
+ Step: 669 {'loss': 0.3406, 'grad_norm': 0.7767547965049744, 'learning_rate': 1.84127380952381e-05, 'epoch': 0.4785714285714286}
135
+ Step: 674 {'loss': 0.4479, 'grad_norm': 1.2478020191192627, 'learning_rate': 1.8400892857142858e-05, 'epoch': 0.48214285714285715}
136
+ Step: 679 {'loss': 0.2975, 'grad_norm': 1.1172363758087158, 'learning_rate': 1.838904761904762e-05, 'epoch': 0.4857142857142857}
137
+ Step: 684 {'loss': 0.3755, 'grad_norm': 0.6458436846733093, 'learning_rate': 1.837720238095238e-05, 'epoch': 0.48928571428571427}
138
+ Step: 689 {'loss': 0.4156, 'grad_norm': 0.7070118188858032, 'learning_rate': 1.8365357142857142e-05, 'epoch': 0.4928571428571429}
139
+ Step: 694 {'loss': 0.3455, 'grad_norm': 0.8581281900405884, 'learning_rate': 1.8353511904761905e-05, 'epoch': 0.49642857142857144}
140
+ Step: 699 {'loss': 0.3322, 'grad_norm': 0.7677241563796997, 'learning_rate': 1.8341666666666668e-05, 'epoch': 0.5}
141
+ Step: 704 {'loss': 0.3077, 'grad_norm': 0.6088917851448059, 'learning_rate': 1.832982142857143e-05, 'epoch': 0.5035714285714286}
142
+ Step: 709 {'loss': 0.332, 'grad_norm': 0.7178571224212646, 'learning_rate': 1.8317976190476193e-05, 'epoch': 0.5071428571428571}
143
+ Step: 714 {'loss': 0.3245, 'grad_norm': 0.7131450772285461, 'learning_rate': 1.8306130952380952e-05, 'epoch': 0.5107142857142857}
144
+ Step: 719 {'loss': 0.3513, 'grad_norm': 1.0770076513290405, 'learning_rate': 1.8294285714285714e-05, 'epoch': 0.5142857142857142}
145
+ Step: 724 {'loss': 0.2694, 'grad_norm': 0.798261821269989, 'learning_rate': 1.8282440476190477e-05, 'epoch': 0.5178571428571429}
146
+ Step: 729 {'loss': 0.2473, 'grad_norm': 0.7949568033218384, 'learning_rate': 1.827059523809524e-05, 'epoch': 0.5214285714285715}
147
+ Step: 734 {'loss': 0.4606, 'grad_norm': 1.0194681882858276, 'learning_rate': 1.8258750000000002e-05, 'epoch': 0.525}
148
+ Step: 739 {'loss': 0.3139, 'grad_norm': 0.6697856187820435, 'learning_rate': 1.8246904761904765e-05, 'epoch': 0.5285714285714286}
149
+ Step: 744 {'loss': 0.3363, 'grad_norm': 1.0922176837921143, 'learning_rate': 1.8235059523809524e-05, 'epoch': 0.5321428571428571}
150
+ Step: 749 {'loss': 0.3805, 'grad_norm': 0.5025736689567566, 'learning_rate': 1.8223214285714286e-05, 'epoch': 0.5357142857142857}
151
+ Step: 754 {'loss': 0.4875, 'grad_norm': 0.9163244366645813, 'learning_rate': 1.821136904761905e-05, 'epoch': 0.5392857142857143}
152
+ Step: 759 {'loss': 0.4214, 'grad_norm': 1.0958220958709717, 'learning_rate': 1.819952380952381e-05, 'epoch': 0.5428571428571428}
153
+ Step: 764 {'loss': 0.291, 'grad_norm': 0.7918387651443481, 'learning_rate': 1.8187678571428574e-05, 'epoch': 0.5464285714285714}
154
+ Step: 769 {'loss': 0.3236, 'grad_norm': 1.247233271598816, 'learning_rate': 1.8175833333333333e-05, 'epoch': 0.55}
155
+ Step: 774 {'loss': 0.2934, 'grad_norm': 0.5760002732276917, 'learning_rate': 1.8163988095238096e-05, 'epoch': 0.5535714285714286}
156
+ Step: 779 {'loss': 0.3677, 'grad_norm': 0.63747239112854, 'learning_rate': 1.8152142857142858e-05, 'epoch': 0.5571428571428572}
157
+ Step: 784 {'loss': 0.4133, 'grad_norm': 0.7723174691200256, 'learning_rate': 1.814029761904762e-05, 'epoch': 0.5607142857142857}
158
+ Step: 789 {'loss': 0.3961, 'grad_norm': 0.7719770669937134, 'learning_rate': 1.8128452380952383e-05, 'epoch': 0.5642857142857143}
159
+ Step: 794 {'loss': 0.3038, 'grad_norm': 0.5547974705696106, 'learning_rate': 1.8116607142857146e-05, 'epoch': 0.5678571428571428}
160
+ Step: 799 {'loss': 0.3156, 'grad_norm': 0.7582687735557556, 'learning_rate': 1.8104761904761905e-05, 'epoch': 0.5714285714285714}
161
+ Step: 804 {'loss': 0.2789, 'grad_norm': 0.7563489675521851, 'learning_rate': 1.8092916666666668e-05, 'epoch': 0.575}
162
+ Step: 809 {'loss': 0.4258, 'grad_norm': 0.6750617027282715, 'learning_rate': 1.808107142857143e-05, 'epoch': 0.5785714285714286}
163
+ Step: 814 {'loss': 0.3774, 'grad_norm': 0.9238749146461487, 'learning_rate': 1.8069226190476193e-05, 'epoch': 0.5821428571428572}
164
+ Step: 819 {'loss': 0.3267, 'grad_norm': 0.9808230400085449, 'learning_rate': 1.8057380952380955e-05, 'epoch': 0.5857142857142857}
165
+ Step: 824 {'loss': 0.3943, 'grad_norm': 0.673556923866272, 'learning_rate': 1.8045535714285714e-05, 'epoch': 0.5892857142857143}
166
+ Step: 829 {'loss': 0.4032, 'grad_norm': 0.9122436046600342, 'learning_rate': 1.8033690476190477e-05, 'epoch': 0.5928571428571429}
167
+ Step: 834 {'loss': 0.4152, 'grad_norm': 1.0862387418746948, 'learning_rate': 1.802184523809524e-05, 'epoch': 0.5964285714285714}
168
+ Step: 839 {'loss': 0.3155, 'grad_norm': 0.4949910640716553, 'learning_rate': 1.8010000000000002e-05, 'epoch': 0.6}
169
+ Step: 844 {'loss': 0.3416, 'grad_norm': 1.0176829099655151, 'learning_rate': 1.7998154761904765e-05, 'epoch': 0.6035714285714285}
170
+ Step: 849 {'loss': 0.421, 'grad_norm': 0.964309811592102, 'learning_rate': 1.7986309523809527e-05, 'epoch': 0.6071428571428571}
171
+ Step: 854 {'loss': 0.3827, 'grad_norm': 0.7528172731399536, 'learning_rate': 1.7974464285714286e-05, 'epoch': 0.6107142857142858}
172
+ Step: 859 {'loss': 0.434, 'grad_norm': 0.6447746753692627, 'learning_rate': 1.796261904761905e-05, 'epoch': 0.6142857142857143}
173
+ Step: 864 {'loss': 0.294, 'grad_norm': 0.7310487627983093, 'learning_rate': 1.795077380952381e-05, 'epoch': 0.6178571428571429}
174
+ Step: 869 {'loss': 0.4067, 'grad_norm': 0.8082703948020935, 'learning_rate': 1.7938928571428574e-05, 'epoch': 0.6214285714285714}
175
+ Step: 874 {'loss': 0.4972, 'grad_norm': 1.139521837234497, 'learning_rate': 1.7927083333333337e-05, 'epoch': 0.625}
176
+ Step: 879 {'loss': 0.4101, 'grad_norm': 0.8615649938583374, 'learning_rate': 1.7915238095238096e-05, 'epoch': 0.6285714285714286}
177
+ Step: 884 {'loss': 0.3884, 'grad_norm': 1.295130968093872, 'learning_rate': 1.7903392857142858e-05, 'epoch': 0.6321428571428571}
178
+ Step: 889 {'loss': 0.3556, 'grad_norm': 0.6960245370864868, 'learning_rate': 1.789154761904762e-05, 'epoch': 0.6357142857142857}
179
+ Step: 894 {'loss': 0.3243, 'grad_norm': 0.7199245691299438, 'learning_rate': 1.7879702380952383e-05, 'epoch': 0.6392857142857142}
180
+ Step: 899 {'loss': 0.5436, 'grad_norm': 0.8263904452323914, 'learning_rate': 1.7867857142857146e-05, 'epoch': 0.6428571428571429}
181
+ Step: 904 {'loss': 0.3568, 'grad_norm': 0.7996845245361328, 'learning_rate': 1.7856011904761905e-05, 'epoch': 0.6464285714285715}
182
+ Step: 909 {'loss': 0.2545, 'grad_norm': 0.8239178657531738, 'learning_rate': 1.7844166666666668e-05, 'epoch': 0.65}
183
+ Step: 914 {'loss': 0.3934, 'grad_norm': 0.8466132283210754, 'learning_rate': 1.783232142857143e-05, 'epoch': 0.6535714285714286}
184
+ Step: 919 {'loss': 0.2826, 'grad_norm': 0.6736989617347717, 'learning_rate': 1.782047619047619e-05, 'epoch': 0.6571428571428571}
185
+ Step: 924 {'loss': 0.476, 'grad_norm': 1.1140491962432861, 'learning_rate': 1.7808630952380952e-05, 'epoch': 0.6607142857142857}
186
+ Step: 929 {'loss': 0.3631, 'grad_norm': 0.7517083883285522, 'learning_rate': 1.7796785714285714e-05, 'epoch': 0.6642857142857143}
187
+ Step: 934 {'loss': 0.3114, 'grad_norm': 0.8430672287940979, 'learning_rate': 1.7784940476190477e-05, 'epoch': 0.6678571428571428}
188
+ Step: 939 {'loss': 0.3025, 'grad_norm': 0.5135239362716675, 'learning_rate': 1.777309523809524e-05, 'epoch': 0.6714285714285714}
189
+ Step: 944 {'loss': 0.3271, 'grad_norm': 0.918813169002533, 'learning_rate': 1.7761250000000002e-05, 'epoch': 0.675}
190
+ Step: 949 {'loss': 0.388, 'grad_norm': 0.9189344644546509, 'learning_rate': 1.774940476190476e-05, 'epoch': 0.6785714285714286}
191
+ Step: 954 {'loss': 0.4937, 'grad_norm': 1.078315258026123, 'learning_rate': 1.7737559523809524e-05, 'epoch': 0.6821428571428572}
192
+ Step: 959 {'loss': 0.3456, 'grad_norm': 1.0054786205291748, 'learning_rate': 1.7725714285714286e-05, 'epoch': 0.6857142857142857}
193
+ Step: 964 {'loss': 0.3445, 'grad_norm': 0.8630911707878113, 'learning_rate': 1.771386904761905e-05, 'epoch': 0.6892857142857143}
194
+ Step: 969 {'loss': 0.3572, 'grad_norm': 0.8356649279594421, 'learning_rate': 1.770202380952381e-05, 'epoch': 0.6928571428571428}
195
+ Step: 974 {'loss': 0.5159, 'grad_norm': 0.9866499900817871, 'learning_rate': 1.769017857142857e-05, 'epoch': 0.6964285714285714}
196
+ Step: 979 {'loss': 0.3134, 'grad_norm': 1.7429964542388916, 'learning_rate': 1.7678333333333333e-05, 'epoch': 0.7}
197
+ Step: 984 {'loss': 0.39, 'grad_norm': 1.1718984842300415, 'learning_rate': 1.7666488095238096e-05, 'epoch': 0.7035714285714286}
198
+ Step: 989 {'loss': 0.3215, 'grad_norm': 0.6568397283554077, 'learning_rate': 1.7654642857142858e-05, 'epoch': 0.7071428571428572}
199
+ Step: 994 {'loss': 0.329, 'grad_norm': 0.9722153544425964, 'learning_rate': 1.764279761904762e-05, 'epoch': 0.7107142857142857}
200
+ Step: 999 {'loss': 0.363, 'grad_norm': 0.7853933572769165, 'learning_rate': 1.7630952380952383e-05, 'epoch': 0.7142857142857143}
201
+ Step: 1004 {'loss': 0.2597, 'grad_norm': 0.7332313060760498, 'learning_rate': 1.7619107142857143e-05, 'epoch': 0.7178571428571429}
202
+ Step: 1009 {'loss': 0.2838, 'grad_norm': 0.690869927406311, 'learning_rate': 1.7607261904761905e-05, 'epoch': 0.7214285714285714}
203
+ Step: 1014 {'loss': 0.2714, 'grad_norm': 0.7609320282936096, 'learning_rate': 1.7595416666666668e-05, 'epoch': 0.725}
204
+ Step: 1019 {'loss': 0.2741, 'grad_norm': 0.7796645760536194, 'learning_rate': 1.758357142857143e-05, 'epoch': 0.7285714285714285}
205
+ Step: 1024 {'loss': 0.2725, 'grad_norm': 0.7715787887573242, 'learning_rate': 1.7571726190476193e-05, 'epoch': 0.7321428571428571}
206
+ Step: 1029 {'loss': 0.2917, 'grad_norm': 0.9862931370735168, 'learning_rate': 1.7559880952380952e-05, 'epoch': 0.7357142857142858}
207
+ Step: 1034 {'loss': 0.3189, 'grad_norm': 0.998388409614563, 'learning_rate': 1.7548035714285715e-05, 'epoch': 0.7392857142857143}
208
+ Step: 1039 {'loss': 0.3415, 'grad_norm': 1.0296244621276855, 'learning_rate': 1.7536190476190477e-05, 'epoch': 0.7428571428571429}
+ Step: 1044 {'loss': 0.3719, 'grad_norm': 0.6839652061462402, 'learning_rate': 1.752434523809524e-05, 'epoch': 0.7464285714285714}
+ Step: 1049 {'loss': 0.3024, 'grad_norm': 1.1470963954925537, 'learning_rate': 1.7512500000000002e-05, 'epoch': 0.75}
+ Step: 1054 {'loss': 0.408, 'grad_norm': 1.2644574642181396, 'learning_rate': 1.7500654761904765e-05, 'epoch': 0.7535714285714286}
+ Step: 1059 {'loss': 0.4078, 'grad_norm': 0.8794793486595154, 'learning_rate': 1.7488809523809524e-05, 'epoch': 0.7571428571428571}
+ Step: 1064 {'loss': 0.3511, 'grad_norm': 0.9097239971160889, 'learning_rate': 1.7476964285714286e-05, 'epoch': 0.7607142857142857}
+ Step: 1069 {'loss': 0.3949, 'grad_norm': 0.6323068737983704, 'learning_rate': 1.746511904761905e-05, 'epoch': 0.7642857142857142}
+ Step: 1074 {'loss': 0.3794, 'grad_norm': 0.6272197365760803, 'learning_rate': 1.745327380952381e-05, 'epoch': 0.7678571428571429}
+ Step: 1079 {'loss': 0.3388, 'grad_norm': 0.8878163695335388, 'learning_rate': 1.7441428571428574e-05, 'epoch': 0.7714285714285715}
+ Step: 1084 {'loss': 0.4305, 'grad_norm': 1.036283016204834, 'learning_rate': 1.7429583333333333e-05, 'epoch': 0.775}
+ Step: 1089 {'loss': 0.3446, 'grad_norm': 1.0395066738128662, 'learning_rate': 1.7417738095238096e-05, 'epoch': 0.7785714285714286}
+ Step: 1094 {'loss': 0.3496, 'grad_norm': 0.6800899505615234, 'learning_rate': 1.740589285714286e-05, 'epoch': 0.7821428571428571}
+ Step: 1099 {'loss': 0.3291, 'grad_norm': 1.1891331672668457, 'learning_rate': 1.739404761904762e-05, 'epoch': 0.7857142857142857}
+ Step: 1104 {'loss': 0.3476, 'grad_norm': 0.8779275417327881, 'learning_rate': 1.7382202380952383e-05, 'epoch': 0.7892857142857143}
+ Step: 1109 {'loss': 0.3047, 'grad_norm': 0.853313684463501, 'learning_rate': 1.7370357142857146e-05, 'epoch': 0.7928571428571428}
+ Step: 1114 {'loss': 0.3375, 'grad_norm': 0.96327143907547, 'learning_rate': 1.7358511904761905e-05, 'epoch': 0.7964285714285714}
+ Step: 1119 {'loss': 0.3974, 'grad_norm': 1.0462590456008911, 'learning_rate': 1.7346666666666668e-05, 'epoch': 0.8}
+ Step: 1124 {'loss': 0.332, 'grad_norm': 0.6860864758491516, 'learning_rate': 1.733482142857143e-05, 'epoch': 0.8035714285714286}
+ Step: 1129 {'loss': 0.5425, 'grad_norm': 1.0949068069458008, 'learning_rate': 1.7322976190476193e-05, 'epoch': 0.8071428571428572}
+ Step: 1134 {'loss': 0.6373, 'grad_norm': 0.7758826613426208, 'learning_rate': 1.7311130952380955e-05, 'epoch': 0.8107142857142857}
+ Step: 1139 {'loss': 0.2984, 'grad_norm': 0.8225033283233643, 'learning_rate': 1.7299285714285718e-05, 'epoch': 0.8142857142857143}
+ Step: 1144 {'loss': 0.3331, 'grad_norm': 0.4973011910915375, 'learning_rate': 1.7287440476190477e-05, 'epoch': 0.8178571428571428}
+ Step: 1149 {'loss': 0.2866, 'grad_norm': 1.2448415756225586, 'learning_rate': 1.727559523809524e-05, 'epoch': 0.8214285714285714}
+ Step: 1154 {'loss': 0.3561, 'grad_norm': 0.8659022450447083, 'learning_rate': 1.7263750000000002e-05, 'epoch': 0.825}
+ Step: 1159 {'loss': 0.3392, 'grad_norm': 1.0248117446899414, 'learning_rate': 1.7251904761904765e-05, 'epoch': 0.8285714285714286}
+ Step: 1164 {'loss': 0.364, 'grad_norm': 0.7102665305137634, 'learning_rate': 1.7240059523809527e-05, 'epoch': 0.8321428571428572}
+ Step: 1169 {'loss': 0.3458, 'grad_norm': 0.7900285124778748, 'learning_rate': 1.7228214285714286e-05, 'epoch': 0.8357142857142857}
+ Step: 1174 {'loss': 0.3239, 'grad_norm': 1.0749526023864746, 'learning_rate': 1.721636904761905e-05, 'epoch': 0.8392857142857143}
+ Step: 1179 {'loss': 0.4444, 'grad_norm': 0.6646351218223572, 'learning_rate': 1.720452380952381e-05, 'epoch': 0.8428571428571429}
+ Step: 1184 {'loss': 0.3368, 'grad_norm': 0.6927091479301453, 'learning_rate': 1.7192678571428574e-05, 'epoch': 0.8464285714285714}
+ Step: 1189 {'loss': 0.2657, 'grad_norm': 0.7363135814666748, 'learning_rate': 1.7180833333333337e-05, 'epoch': 0.85}
+ Step: 1194 {'loss': 0.4087, 'grad_norm': 1.0950276851654053, 'learning_rate': 1.71689880952381e-05, 'epoch': 0.8535714285714285}
+ Step: 1199 {'loss': 0.3012, 'grad_norm': 0.8317290544509888, 'learning_rate': 1.715714285714286e-05, 'epoch': 0.8571428571428571}
+ Step: 1204 {'loss': 0.3327, 'grad_norm': 0.8805026412010193, 'learning_rate': 1.714529761904762e-05, 'epoch': 0.8607142857142858}
+ Step: 1209 {'loss': 0.2755, 'grad_norm': 0.5442625880241394, 'learning_rate': 1.7133452380952384e-05, 'epoch': 0.8642857142857143}
+ Step: 1214 {'loss': 0.3451, 'grad_norm': 0.8105046153068542, 'learning_rate': 1.7121607142857143e-05, 'epoch': 0.8678571428571429}
+ Step: 1219 {'loss': 0.2655, 'grad_norm': 0.8646172881126404, 'learning_rate': 1.7109761904761905e-05, 'epoch': 0.8714285714285714}
+ Step: 1224 {'loss': 0.3989, 'grad_norm': 0.6593888401985168, 'learning_rate': 1.7097916666666668e-05, 'epoch': 0.875}
+ Step: 1229 {'loss': 0.269, 'grad_norm': 0.6822863221168518, 'learning_rate': 1.708607142857143e-05, 'epoch': 0.8785714285714286}
+ Step: 1234 {'loss': 0.3261, 'grad_norm': 0.8558134436607361, 'learning_rate': 1.707422619047619e-05, 'epoch': 0.8821428571428571}
+ Step: 1239 {'loss': 0.317, 'grad_norm': 0.9368207454681396, 'learning_rate': 1.7062380952380952e-05, 'epoch': 0.8857142857142857}
+ Step: 1244 {'loss': 0.3394, 'grad_norm': 1.1915005445480347, 'learning_rate': 1.7050535714285715e-05, 'epoch': 0.8892857142857142}
+ Step: 1249 {'loss': 0.3968, 'grad_norm': 1.3165326118469238, 'learning_rate': 1.7038690476190477e-05, 'epoch': 0.8928571428571429}
+ Step: 1254 {'loss': 0.3263, 'grad_norm': 1.0021530389785767, 'learning_rate': 1.702684523809524e-05, 'epoch': 0.8964285714285715}
+ Step: 1259 {'loss': 0.3063, 'grad_norm': 0.679772138595581, 'learning_rate': 1.7015000000000002e-05, 'epoch': 0.9}
+ Step: 1264 {'loss': 0.3759, 'grad_norm': 1.0995697975158691, 'learning_rate': 1.700315476190476e-05, 'epoch': 0.9035714285714286}
+ Step: 1269 {'loss': 0.377, 'grad_norm': 0.7673999071121216, 'learning_rate': 1.6991309523809524e-05, 'epoch': 0.9071428571428571}
+ Step: 1274 {'loss': 0.2665, 'grad_norm': 0.9506070613861084, 'learning_rate': 1.6979464285714287e-05, 'epoch': 0.9107142857142857}
+ Step: 1279 {'loss': 0.337, 'grad_norm': 0.766395092010498, 'learning_rate': 1.696761904761905e-05, 'epoch': 0.9142857142857143}
+ Step: 1284 {'loss': 0.4346, 'grad_norm': 0.9738909602165222, 'learning_rate': 1.695577380952381e-05, 'epoch': 0.9178571428571428}
+ Step: 1289 {'loss': 0.3746, 'grad_norm': 0.7770394086837769, 'learning_rate': 1.694392857142857e-05, 'epoch': 0.9214285714285714}
+ Step: 1294 {'loss': 0.3313, 'grad_norm': 0.7297780513763428, 'learning_rate': 1.6932083333333333e-05, 'epoch': 0.925}
+ Step: 1299 {'loss': 0.2412, 'grad_norm': 1.143165111541748, 'learning_rate': 1.6920238095238096e-05, 'epoch': 0.9285714285714286}
+ Step: 1304 {'loss': 0.4314, 'grad_norm': 1.1894090175628662, 'learning_rate': 1.690839285714286e-05, 'epoch': 0.9321428571428572}
+ Step: 1309 {'loss': 0.2754, 'grad_norm': 0.7497856020927429, 'learning_rate': 1.689654761904762e-05, 'epoch': 0.9357142857142857}
+ Step: 1314 {'loss': 0.3167, 'grad_norm': 0.7976461052894592, 'learning_rate': 1.6884702380952384e-05, 'epoch': 0.9392857142857143}
+ Step: 1319 {'loss': 0.309, 'grad_norm': 0.8367032408714294, 'learning_rate': 1.6872857142857143e-05, 'epoch': 0.9428571428571428}
+ Step: 1324 {'loss': 0.2803, 'grad_norm': 0.7481916546821594, 'learning_rate': 1.6861011904761905e-05, 'epoch': 0.9464285714285714}
+ Step: 1329 {'loss': 0.3361, 'grad_norm': 1.4836623668670654, 'learning_rate': 1.6849166666666668e-05, 'epoch': 0.95}
+ Step: 1334 {'loss': 0.236, 'grad_norm': 0.7607225775718689, 'learning_rate': 1.683732142857143e-05, 'epoch': 0.9535714285714286}
+ Step: 1339 {'loss': 0.3701, 'grad_norm': 0.9192880988121033, 'learning_rate': 1.6825476190476193e-05, 'epoch': 0.9571428571428572}
+ Step: 1344 {'loss': 0.3533, 'grad_norm': 0.8493601083755493, 'learning_rate': 1.6813630952380955e-05, 'epoch': 0.9607142857142857}
+ Step: 1349 {'loss': 0.3261, 'grad_norm': 0.7196798324584961, 'learning_rate': 1.6801785714285715e-05, 'epoch': 0.9642857142857143}
+ Step: 1354 {'loss': 0.3694, 'grad_norm': 1.0612856149673462, 'learning_rate': 1.6789940476190477e-05, 'epoch': 0.9678571428571429}
+ Step: 1359 {'loss': 0.3639, 'grad_norm': 0.9527296423912048, 'learning_rate': 1.677809523809524e-05, 'epoch': 0.9714285714285714}
+ Step: 1364 {'loss': 0.3435, 'grad_norm': 0.7819812893867493, 'learning_rate': 1.6766250000000002e-05, 'epoch': 0.975}
+ Step: 1369 {'loss': 0.3111, 'grad_norm': 0.7119016647338867, 'learning_rate': 1.6754404761904765e-05, 'epoch': 0.9785714285714285}
+ Step: 1374 {'loss': 0.3079, 'grad_norm': 0.6907299757003784, 'learning_rate': 1.6742559523809524e-05, 'epoch': 0.9821428571428571}
+ Step: 1379 {'loss': 0.3241, 'grad_norm': 0.8788382411003113, 'learning_rate': 1.6730714285714287e-05, 'epoch': 0.9857142857142858}
+ Step: 1384 {'loss': 0.2445, 'grad_norm': 0.9006222486495972, 'learning_rate': 1.671886904761905e-05, 'epoch': 0.9892857142857143}
+ Step: 1389 {'loss': 0.2705, 'grad_norm': 0.8007070422172546, 'learning_rate': 1.670702380952381e-05, 'epoch': 0.9928571428571429}
+ Step: 1394 {'loss': 0.3579, 'grad_norm': 1.085525393486023, 'learning_rate': 1.6695178571428574e-05, 'epoch': 0.9964285714285714}
+ Step: 1399 {'loss': 0.278, 'grad_norm': 0.8911979794502258, 'learning_rate': 1.6683333333333337e-05, 'epoch': 1.0}
+ Interrupted by user
+ Step: 1401 {'train_runtime': 18039.657, 'train_samples_per_second': 1.863, 'train_steps_per_second': 0.466, 'train_loss': 0.4571322862499961, 'epoch': 1.0014285714285713}
+ 17:57:53-413132 INFO LoRA training run is completed and saved.
+ 17:57:53-510668 INFO Training complete, saving
+ 17:57:53-597183 INFO Training interrupted.
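The per-step dictionaries above are emitted by the text-generation-webui training tab as Python literals, and the step-1401 summary allows a quick sanity check: 1.863 samples/s ÷ 0.466 steps/s ≈ 4 samples per optimizer step, and epoch 1.0 landing at step 1400 implies roughly 5,600 training samples. A minimal sketch for turning the log into loss/learning-rate series, assuming it has been saved locally (the filename is hypothetical):

```python
import ast
import re

records = []
with open("epoch_1_training_log.txt") as f:  # hypothetical path for the log shown above
    for line in f:
        m = re.match(r"\+?\s*Step:\s*(\d+)\s*(\{.*\})", line.strip())
        if m and "'loss'" in m.group(2):  # skip the final runtime-summary dict
            metrics = ast.literal_eval(m.group(2))  # entries are Python literals, not JSON
            records.append((int(m.group(1)), metrics["loss"], metrics["learning_rate"]))

steps, losses, lrs = zip(*records)
print(f"{len(records)} points, loss {losses[0]:.3f} -> {losses[-1]:.3f}, "
      f"lr {lrs[0]:.3e} -> {lrs[-1]:.3e}")
```

Plotting `lrs` against `steps` would show the learning rate falling by about 1.18e-8 every five steps, consistent with a linear decay schedule spread over a multi-epoch run.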
config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": ".\\DigitalSoul",
+ "_name_or_path": ".\\BlackSheep",
  "architectures": [
  "MixtralForCausalLM"
  ],
@@ -29,7 +29,7 @@
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "transformers_version": "4.44.0",
+ "transformers_version": "4.44.2",
  "use_cache": false,
  "vocab_size": 32064
  }
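Beyond renaming the base path from .\\DigitalSoul to .\\BlackSheep, the config change only records the transformers bump from 4.44.0 to 4.44.2. A sketch of loading the merged checkpoint under those settings; the repo id is an assumption, not something this diff states:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "TroyDoesAI/BlackSheep"  # assumed; substitute the actual Hub repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    device_map="auto",           # spreads the four safetensors shards across available devices
)
```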
generation_config.json CHANGED
@@ -2,5 +2,5 @@
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 32000,
- "transformers_version": "4.44.0"
+ "transformers_version": "4.44.2"
  }
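The same version bump applies here. With vocab_size at 32064, ids 32000–32063 are the 64 tokens added beyond the base 32000-token vocabulary, so generation stops at added token 32000 rather than the base model's default end-of-sequence token (id 2). A small sketch of what this file resolves to at runtime:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig(bos_token_id=1, eos_token_id=32000)
# model.generate(..., generation_config=gen_config) would stop on token 32000,
# one of the added tokens implied by "vocab_size": 32064 in config.json.
print(gen_config.eos_token_id)  # 32000
```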
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c5801f3d971ed8ffe5c8c787bcbd09e0e954a46741a92e3a666da2c1011fdce
+ size 4991385392
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9cddb7085ddda505a7a73388c789fbe5f37ab266fa9340abe6f25a788b0e8266
+ size 4995729856
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f03f0654b8149e0d9dc941791d9c08c734d1e43e079e5b1680232fbeb830d4a
+ size 4957962344
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:463bf1b25f6a83f7e414b9c001e59a5952c4b291fe2b99cf7c9e07c5841a8b75
+ size 2361411944
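These four entries are Git LFS pointer files: each records only the sha256 and byte size of a shard, with the actual weights stored out of band. A sketch for verifying downloaded shards against that metadata (it assumes the files sit in the current directory):

```python
import hashlib
from pathlib import Path

# Expected (oid, size) pairs copied from the LFS pointers above.
expected = {
    "model-00001-of-00004.safetensors": ("3c5801f3d971ed8ffe5c8c787bcbd09e0e954a46741a92e3a666da2c1011fdce", 4991385392),
    "model-00002-of-00004.safetensors": ("9cddb7085ddda505a7a73388c789fbe5f37ab266fa9340abe6f25a788b0e8266", 4995729856),
    "model-00003-of-00004.safetensors": ("1f03f0654b8149e0d9dc941791d9c08c734d1e43e079e5b1680232fbeb830d4a", 4957962344),
    "model-00004-of-00004.safetensors": ("463bf1b25f6a83f7e414b9c001e59a5952c4b291fe2b99cf7c9e07c5841a8b75", 2361411944),
}

for name, (oid, size) in expected.items():
    p = Path(name)
    h = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    ok = h.hexdigest() == oid and p.stat().st_size == size
    print(f"{name}: {'OK' if ok else 'MISMATCH'}")
```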
model.safetensors.index.json ADDED
@@ -0,0 +1,522 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 17306425344
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.1.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
34
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
35
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
36
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
37
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
38
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
39
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
40
+ "model.layers.10.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.10.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.10.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.10.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.10.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.10.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.11.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.11.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.11.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.11.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.11.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.11.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.11.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.11.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.11.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.12.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.12.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.12.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.12.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.12.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.12.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.12.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.12.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.12.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.12.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.13.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.13.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.13.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.13.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.13.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.13.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.13.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.13.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.13.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.13.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.14.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.14.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.14.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.14.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.14.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.14.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.14.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.14.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.14.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.14.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.15.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.15.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.15.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.15.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.15.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.15.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.15.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.15.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.15.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.15.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.16.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.16.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.16.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.16.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.16.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.16.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
142
+ "model.layers.16.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
143
+ "model.layers.16.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
144
+ "model.layers.16.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
145
+ "model.layers.16.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
146
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
147
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
148
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
149
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
150
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
151
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
152
+ "model.layers.17.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
153
+ "model.layers.17.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
154
+ "model.layers.17.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
155
+ "model.layers.17.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
156
+ "model.layers.17.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
157
+ "model.layers.17.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
158
+ "model.layers.17.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
159
+ "model.layers.17.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
160
+ "model.layers.17.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
161
+ "model.layers.17.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
162
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
163
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
164
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
165
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
166
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
167
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
168
+ "model.layers.18.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
169
+ "model.layers.18.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
170
+ "model.layers.18.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
171
+ "model.layers.18.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
172
+ "model.layers.18.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.18.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.18.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.18.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.18.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.18.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
178
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
181
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
182
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
183
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
184
+ "model.layers.19.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.19.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.19.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.19.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.19.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.19.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.19.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.19.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.19.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.19.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.2.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
201
+ "model.layers.2.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
202
+ "model.layers.2.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
203
+ "model.layers.2.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
204
+ "model.layers.2.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
205
+ "model.layers.2.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
206
+ "model.layers.2.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
207
+ "model.layers.2.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
208
+ "model.layers.2.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
209
+ "model.layers.2.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
210
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
211
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
212
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
213
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
214
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
215
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
216
+ "model.layers.20.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.20.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.20.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.20.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.20.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.20.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.20.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.20.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.20.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.20.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.21.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.21.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.21.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.21.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.21.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.21.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.21.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.21.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.21.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.21.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.22.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.22.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.22.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.22.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.22.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.22.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.22.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.22.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.22.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.22.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
262
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
263
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
264
+ "model.layers.23.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
265
+ "model.layers.23.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
266
+ "model.layers.23.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
267
+ "model.layers.23.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
268
+ "model.layers.23.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
269
+ "model.layers.23.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
270
+ "model.layers.23.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
271
+ "model.layers.23.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
272
+ "model.layers.23.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
273
+ "model.layers.23.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
275
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
276
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
277
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
279
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.layers.24.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.24.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
282
+ "model.layers.24.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
283
+ "model.layers.24.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
284
+ "model.layers.24.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
285
+ "model.layers.24.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
286
+ "model.layers.24.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
287
+ "model.layers.24.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
288
+ "model.layers.24.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
289
+ "model.layers.24.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
290
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
291
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
292
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
293
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
295
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
296
+ "model.layers.25.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
297
+ "model.layers.25.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.25.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
299
+ "model.layers.25.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.25.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
301
+ "model.layers.25.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
302
+ "model.layers.25.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
303
+ "model.layers.25.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
304
+ "model.layers.25.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.25.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
306
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
307
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
308
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
309
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
311
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
312
+ "model.layers.26.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
313
+ "model.layers.26.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
314
+ "model.layers.26.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
315
+ "model.layers.26.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
316
+ "model.layers.26.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
317
+ "model.layers.26.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
318
+ "model.layers.26.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
319
+ "model.layers.26.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
320
+ "model.layers.26.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00004.safetensors",
321
+ "model.layers.26.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
322
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
323
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
324
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
325
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
326
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
327
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
328
+ "model.layers.27.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00004.safetensors",
329
+ "model.layers.27.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00004.safetensors",
330
+ "model.layers.27.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00004.safetensors",
331
+ "model.layers.27.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00004.safetensors",
332
+ "model.layers.27.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00004.safetensors",
333
+ "model.layers.27.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00004.safetensors",
334
+ "model.layers.27.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00004.safetensors",
335
+ "model.layers.27.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00004.safetensors",
336
+ "model.layers.27.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
337
+ "model.layers.27.block_sparse_moe.gate.weight": "model-00003-of-00004.safetensors",
338
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
339
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
340
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
341
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
342
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
343
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
344
+ "model.layers.28.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00004.safetensors",
345
+ "model.layers.28.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00004.safetensors",
346
+ "model.layers.28.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00004.safetensors",
347
+ "model.layers.28.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00004.safetensors",
348
+ "model.layers.28.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00004.safetensors",
349
+ "model.layers.28.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00004.safetensors",
350
+ "model.layers.28.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00004.safetensors",
351
+ "model.layers.28.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00004.safetensors",
352
+ "model.layers.28.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
353
+ "model.layers.28.block_sparse_moe.gate.weight": "model-00004-of-00004.safetensors",
354
+ "model.layers.28.input_layernorm.weight": "model-00004-of-00004.safetensors",
355
+ "model.layers.28.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
356
+ "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
357
+ "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
358
+ "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
359
+ "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
360
+ "model.layers.29.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00004.safetensors",
361
+ "model.layers.29.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00004.safetensors",
362
+ "model.layers.29.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00004.safetensors",
363
+ "model.layers.29.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00004.safetensors",
364
+ "model.layers.29.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00004.safetensors",
365
+ "model.layers.29.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00004.safetensors",
366
+ "model.layers.29.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00004.safetensors",
367
+ "model.layers.29.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00004.safetensors",
368
+ "model.layers.29.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
369
+ "model.layers.29.block_sparse_moe.gate.weight": "model-00004-of-00004.safetensors",
370
+ "model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
371
+ "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
372
+ "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
373
+ "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
374
+ "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
375
+ "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
376
+ "model.layers.3.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
377
+ "model.layers.3.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
378
+ "model.layers.3.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
379
+ "model.layers.3.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
380
+ "model.layers.3.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
381
+ "model.layers.3.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
382
+ "model.layers.3.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
383
+ "model.layers.3.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
384
+ "model.layers.3.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
385
+ "model.layers.3.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
386
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
387
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
388
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
389
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
390
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
391
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
392
+ "model.layers.30.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00004.safetensors",
393
+ "model.layers.30.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00004.safetensors",
394
+ "model.layers.30.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00004.safetensors",
395
+ "model.layers.30.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00004.safetensors",
396
+ "model.layers.30.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00004.safetensors",
397
+ "model.layers.30.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00004.safetensors",
398
+ "model.layers.30.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00004.safetensors",
399
+ "model.layers.30.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00004.safetensors",
400
+ "model.layers.30.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
401
+ "model.layers.30.block_sparse_moe.gate.weight": "model-00004-of-00004.safetensors",
402
+ "model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
403
+ "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
404
+ "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
405
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
406
+ "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
407
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
408
+ "model.layers.31.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00004.safetensors",
409
+ "model.layers.31.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00004.safetensors",
410
+ "model.layers.31.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00004.safetensors",
411
+ "model.layers.31.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00004.safetensors",
412
+ "model.layers.31.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00004.safetensors",
413
+ "model.layers.31.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00004.safetensors",
414
+ "model.layers.31.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00004.safetensors",
415
+ "model.layers.31.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00004.safetensors",
416
+ "model.layers.31.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00004.safetensors",
417
+ "model.layers.31.block_sparse_moe.gate.weight": "model-00004-of-00004.safetensors",
418
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
419
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
420
+ "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
421
+ "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
422
+ "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
423
+ "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
424
+ "model.layers.4.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
425
+ "model.layers.4.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
426
+ "model.layers.4.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
427
+ "model.layers.4.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
428
+ "model.layers.4.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
429
+ "model.layers.4.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
430
+ "model.layers.4.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
431
+ "model.layers.4.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
432
+ "model.layers.4.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
433
+ "model.layers.4.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
434
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
435
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
436
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
437
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
438
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
439
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
440
+ "model.layers.5.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
441
+ "model.layers.5.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
442
+ "model.layers.5.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
443
+ "model.layers.5.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
444
+ "model.layers.5.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
445
+ "model.layers.5.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
446
+ "model.layers.5.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
447
+ "model.layers.5.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
448
+ "model.layers.5.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
449
+ "model.layers.5.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
450
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
451
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
452
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
453
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
454
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
455
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
456
+ "model.layers.6.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
457
+ "model.layers.6.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
458
+ "model.layers.6.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
459
+ "model.layers.6.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
460
+ "model.layers.6.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
461
+ "model.layers.6.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
462
+ "model.layers.6.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
463
+ "model.layers.6.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
464
+ "model.layers.6.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
465
+ "model.layers.6.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
466
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
467
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
468
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
469
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
470
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
471
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
472
+ "model.layers.7.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
473
+ "model.layers.7.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
474
+ "model.layers.7.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
475
+ "model.layers.7.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
476
+ "model.layers.7.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
477
+ "model.layers.7.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
478
+ "model.layers.7.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
479
+ "model.layers.7.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
480
+ "model.layers.7.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
481
+ "model.layers.7.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
482
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
483
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
484
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
485
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
486
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
487
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
488
+ "model.layers.8.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00004.safetensors",
489
+ "model.layers.8.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00004.safetensors",
490
+ "model.layers.8.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00004.safetensors",
491
+ "model.layers.8.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00004.safetensors",
492
+ "model.layers.8.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00004.safetensors",
493
+ "model.layers.8.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00004.safetensors",
494
+ "model.layers.8.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00004.safetensors",
495
+ "model.layers.8.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00004.safetensors",
496
+ "model.layers.8.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00004.safetensors",
497
+ "model.layers.8.block_sparse_moe.gate.weight": "model-00001-of-00004.safetensors",
498
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
499
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
500
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
501
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
502
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
503
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
504
+ "model.layers.9.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00004.safetensors",
505
+ "model.layers.9.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00004.safetensors",
506
+ "model.layers.9.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00004.safetensors",
507
+ "model.layers.9.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00004.safetensors",
508
+ "model.layers.9.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00004.safetensors",
509
+ "model.layers.9.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00004.safetensors",
510
+ "model.layers.9.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00004.safetensors",
511
+ "model.layers.9.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00004.safetensors",
512
+ "model.layers.9.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00004.safetensors",
513
+ "model.layers.9.block_sparse_moe.gate.weight": "model-00002-of-00004.safetensors",
514
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
515
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
516
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
517
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
518
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
519
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
520
+ "model.norm.weight": "model-00004-of-00004.safetensors"
521
+ }
522
+ }
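The index maps every tensor to the shard that holds it, so loaders can open only the files they need. Notably, each layer lists just three experts (experts.0 through experts.2), a slimmed-down Mixtral-style MoE rather than the usual eight-expert layout. A sketch for summarizing the index; note that metadata.total_size counts tensor bytes only, which is why it comes in slightly under the four shard sizes combined (each shard also carries a safetensors header):

```python
import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# How many tensors live in each shard file.
per_shard = Counter(index["weight_map"].values())
for shard, n in sorted(per_shard.items()):
    print(f"{shard}: {n} tensors")

total = index["metadata"]["total_size"]
# Assuming every tensor is bf16 (2 bytes per value, per config.json's torch_dtype),
# this works out to roughly 8.65B parameters.
print(f"total_size = {total} bytes (~{total / 2 / 1e9:.2f}B bf16 parameters)")
```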