Model save
Browse files- README.md +3 -8
- adapter_config.json +4 -4
- adapter_model.safetensors +1 -1
- all_results.json +9 -9
- eval_results.json +5 -5
- train_results.json +5 -5
- trainer_state.json +10 -42
- training_args.bin +1 -1
README.md
CHANGED
@@ -2,13 +2,11 @@
|
|
2 |
license: other
|
3 |
library_name: peft
|
4 |
tags:
|
5 |
-
- alignment-handbook
|
6 |
-
- generated_from_trainer
|
7 |
- trl
|
8 |
- sft
|
9 |
- generated_from_trainer
|
10 |
datasets:
|
11 |
-
-
|
12 |
base_model: 01-ai/Yi-6B
|
13 |
model-index:
|
14 |
- name: Yi-6B-zhihu3
|
@@ -20,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
|
|
20 |
|
21 |
# Yi-6B-zhihu3
|
22 |
|
23 |
-
This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the
|
24 |
It achieves the following results on the evaluation set:
|
25 |
-
- Loss: 2.
|
26 |
|
27 |
## Model description
|
28 |
|
@@ -52,9 +50,6 @@ The following hyperparameters were used during training:
|
|
52 |
|
53 |
### Training results
|
54 |
|
55 |
-
| Training Loss | Epoch | Step | Validation Loss |
|
56 |
-
|:-------------:|:-----:|:----:|:---------------:|
|
57 |
-
| 2.303 | 1.0 | 820 | 2.3217 |
|
58 |
|
59 |
|
60 |
### Framework versions
|
|
|
2 |
license: other
|
3 |
library_name: peft
|
4 |
tags:
|
|
|
|
|
5 |
- trl
|
6 |
- sft
|
7 |
- generated_from_trainer
|
8 |
datasets:
|
9 |
+
- generator
|
10 |
base_model: 01-ai/Yi-6B
|
11 |
model-index:
|
12 |
- name: Yi-6B-zhihu3
|
|
|
18 |
|
19 |
# Yi-6B-zhihu3
|
20 |
|
21 |
+
This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the generator dataset.
|
22 |
It achieves the following results on the evaluation set:
|
23 |
+
- Loss: 2.5565
|
24 |
|
25 |
## Model description
|
26 |
|
|
|
50 |
|
51 |
### Training results
|
52 |
|
|
|
|
|
|
|
53 |
|
54 |
|
55 |
### Framework versions
|
adapter_config.json
CHANGED
@@ -19,13 +19,13 @@
|
|
19 |
"rank_pattern": {},
|
20 |
"revision": null,
|
21 |
"target_modules": [
|
22 |
-
"
|
23 |
-
"o_proj",
|
24 |
-
"v_proj",
|
25 |
"q_proj",
|
|
|
26 |
"gate_proj",
|
27 |
"up_proj",
|
28 |
-
"
|
|
|
29 |
],
|
30 |
"task_type": "CAUSAL_LM"
|
31 |
}
|
|
|
19 |
"rank_pattern": {},
|
20 |
"revision": null,
|
21 |
"target_modules": [
|
22 |
+
"down_proj",
|
|
|
|
|
23 |
"q_proj",
|
24 |
+
"k_proj",
|
25 |
"gate_proj",
|
26 |
"up_proj",
|
27 |
+
"o_proj",
|
28 |
+
"v_proj"
|
29 |
],
|
30 |
"task_type": "CAUSAL_LM"
|
31 |
}
|
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 72673912
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df896d3219d78f6ae6755039d54dde33208844da975f18b6e96546f08cf24293
|
3 |
size 72673912
|
all_results.json
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
{
|
2 |
-
"epoch":
|
3 |
-
"eval_loss": 2.
|
4 |
-
"eval_runtime":
|
5 |
"eval_samples": 2561,
|
6 |
-
"eval_samples_per_second": 3.
|
7 |
-
"eval_steps_per_second": 3.
|
8 |
-
"train_loss": 0.
|
9 |
-
"train_runtime":
|
10 |
"train_samples": 2561,
|
11 |
-
"train_samples_per_second":
|
12 |
-
"train_steps_per_second":
|
13 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 0.98,
|
3 |
+
"eval_loss": 2.556525945663452,
|
4 |
+
"eval_runtime": 237.4327,
|
5 |
"eval_samples": 2561,
|
6 |
+
"eval_samples_per_second": 3.226,
|
7 |
+
"eval_steps_per_second": 3.226,
|
8 |
+
"train_loss": 0.0,
|
9 |
+
"train_runtime": 12.6248,
|
10 |
"train_samples": 2561,
|
11 |
+
"train_samples_per_second": 60.674,
|
12 |
+
"train_steps_per_second": 60.674
|
13 |
}
|
eval_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
-
"epoch":
|
3 |
-
"eval_loss": 2.
|
4 |
-
"eval_runtime":
|
5 |
"eval_samples": 2561,
|
6 |
-
"eval_samples_per_second": 3.
|
7 |
-
"eval_steps_per_second": 3.
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 0.98,
|
3 |
+
"eval_loss": 2.556525945663452,
|
4 |
+
"eval_runtime": 237.4327,
|
5 |
"eval_samples": 2561,
|
6 |
+
"eval_samples_per_second": 3.226,
|
7 |
+
"eval_steps_per_second": 3.226
|
8 |
}
|
train_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
-
"epoch":
|
3 |
-
"train_loss": 0.
|
4 |
-
"train_runtime":
|
5 |
"train_samples": 2561,
|
6 |
-
"train_samples_per_second":
|
7 |
-
"train_steps_per_second":
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"epoch": 0.98,
|
3 |
+
"train_loss": 0.0,
|
4 |
+
"train_runtime": 12.6248,
|
5 |
"train_samples": 2561,
|
6 |
+
"train_samples_per_second": 60.674,
|
7 |
+
"train_steps_per_second": 60.674
|
8 |
}
|
trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch":
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -976,52 +976,20 @@
|
|
976 |
},
|
977 |
{
|
978 |
"epoch": 0.98,
|
979 |
-
"
|
980 |
-
"
|
981 |
-
"
|
982 |
-
|
983 |
-
|
984 |
-
"
|
985 |
-
"learning_rate": 9.059233262386225e-08,
|
986 |
-
"loss": 2.4554,
|
987 |
-
"step": 810
|
988 |
-
},
|
989 |
-
{
|
990 |
-
"epoch": 0.99,
|
991 |
-
"learning_rate": 2.2650648415334376e-08,
|
992 |
-
"loss": 2.5679,
|
993 |
-
"step": 815
|
994 |
-
},
|
995 |
-
{
|
996 |
-
"epoch": 1.0,
|
997 |
-
"learning_rate": 0.0,
|
998 |
-
"loss": 2.303,
|
999 |
-
"step": 820
|
1000 |
-
},
|
1001 |
-
{
|
1002 |
-
"epoch": 1.0,
|
1003 |
-
"eval_loss": 2.321652412414551,
|
1004 |
-
"eval_runtime": 249.3077,
|
1005 |
-
"eval_samples_per_second": 3.289,
|
1006 |
-
"eval_steps_per_second": 3.289,
|
1007 |
-
"step": 820
|
1008 |
-
},
|
1009 |
-
{
|
1010 |
-
"epoch": 1.0,
|
1011 |
-
"step": 820,
|
1012 |
-
"total_flos": 5.879639335501824e+16,
|
1013 |
-
"train_loss": 0.9362016701116794,
|
1014 |
-
"train_runtime": 598.217,
|
1015 |
-
"train_samples_per_second": 1.371,
|
1016 |
-
"train_steps_per_second": 1.371
|
1017 |
}
|
1018 |
],
|
1019 |
"logging_steps": 5,
|
1020 |
-
"max_steps":
|
1021 |
"num_input_tokens_seen": 0,
|
1022 |
"num_train_epochs": 1,
|
1023 |
"save_steps": 100,
|
1024 |
-
"total_flos": 5.
|
1025 |
"train_batch_size": 1,
|
1026 |
"trial_name": null,
|
1027 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.975609756097561,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 800,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
976 |
},
|
977 |
{
|
978 |
"epoch": 0.98,
|
979 |
+
"step": 800,
|
980 |
+
"total_flos": 5.73623349805056e+16,
|
981 |
+
"train_loss": 0.0,
|
982 |
+
"train_runtime": 12.6248,
|
983 |
+
"train_samples_per_second": 60.674,
|
984 |
+
"train_steps_per_second": 60.674
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
985 |
}
|
986 |
],
|
987 |
"logging_steps": 5,
|
988 |
+
"max_steps": 766,
|
989 |
"num_input_tokens_seen": 0,
|
990 |
"num_train_epochs": 1,
|
991 |
"save_steps": 100,
|
992 |
+
"total_flos": 5.73623349805056e+16,
|
993 |
"train_batch_size": 1,
|
994 |
"trial_name": null,
|
995 |
"trial_params": null
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4728
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4409cfb726e8f752d47ceb3dab2ab0604266fff53118f9c8e0f4f1c34cb19fc
|
3 |
size 4728
|