File size: 4,210 Bytes
7362de1
 
d80331d
 
3d18ac5
 
 
 
 
 
 
 
 
 
 
e1b7c2b
3d18ac5
e1b7c2b
3d18ac5
e1b7c2b
3d18ac5
e1b7c2b
3d18ac5
e1b7c2b
3d18ac5
e1b7c2b
3d18ac5
 
945cbd6
 
1da5a30
e012bdc
 
945cbd6
8e24955
 
945cbd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
If you like this model, consider joining my discord to provide feedback: https://discord.gg/QXdn8hWSkY

This is a storywriting and roleplay model trained on a significant amount of self-generated, long-context, multi-turn roleplay data.

I downloaded a bit under a thousand cards from chub.ai and created a synthetic roleplay for each card. I batched as many turns as I could into 4k-token chunks in order to maintain coherency over longer contexts. There was a lot of cleaning and validation between each batch, so a lot of examples were "lost," but the final output seems to be of very good quality. The longest conversation is about 20k tokens, and I plan to extend this further as well as broaden the dataset with more examples. The first 4k tokens were generated with Command-R-Plus, with the remainder generated with byroneverson/Mistral-Small-Instruct-2409-abliterated.

Next, I downloaded the prompt backup from this site and used it as a seed for some storywriting data:

https://aetherroom.club/whats-new#backup-update 

I went over it twice with Command-R-Plus: on the first pass it wrote a first draft of the output, and on the second pass it improved and extended the length of that output.

Also included was a subset of the following datasets:

anthracite-org/stheno-filtered-v1.1

anthracite-org/kalo_misc_part2

anthracite-org/kalo_opus_misc_240827

anthracite-org/kalo-opus-instruct-22k-no-refusal

Chaser-cz/sonnet35-charcard-roleplay-sharegpt

(A very small subset) jondurbin/airoboros-3.2

And various other data, viewable at openerotica/mixed-rp

Every line of data was run through a large model in order to filter for low quality, repetition, and underage content.

There is a LOT more I can do to improve the dataset used to create this model. As of now, the storywriting data is all single-shot, whereas I'd like to create some examples of "continue the story in x direction..." and "rewrite it adding/removing these elements". More Stable Diffusion roleplay prompting data could also be helpful. Roleplay conversations should be extended to 32k and beyond.


[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)

```yaml
# Base checkpoint plus the loader and tokenizer classes used to open it.
base_model: mistralai/Mistral-Nemo-Base-2407
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# 4-bit loading is on and 8-bit is off; this pairs with `adapter: qlora` later in the file.
load_in_8bit: false
load_in_4bit: true
# NOTE(review): presumably relaxes config-key validation — confirm against the Axolotl docs.
strict: false

# Single training dataset; the model card text says the data mix is viewable at
# openerotica/mixed-rp.
datasets:
  - path: openerotica/mixed-rp
    # Loader type and conversation format for this dataset entry.
    # NOTE(review): presumably ShareGPT-style records rendered as ChatML — confirm.
    type: sharegpt
    conversation: chatml

# Prompt template and adapter settings: QLoRA with rank 128 and alpha 256 (2x the rank).
chat_template: chatml
adapter: qlora
lora_r: 128
lora_alpha: 256
# embed_tokens and lm_head are listed as modules to save alongside the adapter.
# NOTE(review): presumably required because `tokens:` / `special_tokens:` later in this
# file introduce the ChatML markers — TODO confirm.
lora_modules_to_save: [embed_tokens, lm_head]
lora_dropout: 0.05
# NOTE(review): lora_target_linear is true AND an explicit module list follows; the two
# likely overlap — confirm how Axolotl combines them.
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Left empty (parses as null): no prepared-dataset path is set.
dataset_prepared_path:
# NOTE(review): presumably the fraction of data held out for evaluation (1%) — confirm.
val_set_size: 0.01
output_dir: /workspace/axolotl/mixed-rp-mistral-nemo

# 20000 lines up with the model card's "longest conversation is about 20k tokens".
sequence_len: 20000
sample_packing: true
pad_to_sequence_len: true

# Weights & Biases project name; the other wandb_* keys are left empty (null).
wandb_project: mistral-2
wandb_watch:
wandb_run_id:
wandb_log_model:

# Optimisation settings: micro-batch of 1 with 2 gradient-accumulation steps, one epoch,
# torch AdamW with a cosine learning-rate schedule.
gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch
lr_scheduler: cosine
# NOTE(review): `1e-5` has no decimal point; YAML 1.1 loaders such as PyYAML may read it
# as a string, so this relies on the consumer coercing it to a float — confirm.
learning_rate: 1e-5
train_on_inputs: false
group_by_length: false
# Precision flags: bf16 is `auto`, fp16 is left empty (null), tf32 is disabled.
bf16: auto
fp16:
tf32: false

# Gradient checkpointing is enabled with use_reentrant set to false.
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
# xformers attention is left unset; flash attention is enabled instead.
xformers_attention:
flash_attention: true

# Warmup, evaluation and checkpoint cadence; empty keys parse as null.
warmup_steps: 100
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
save_total_limit: 2
save_steps:
debug:
deepspeed: 
weight_decay: 0.1
# Special-token overrides: the ChatML end marker "<|im_end|>" is set as the EOS token.
special_tokens:
  eos_token: "<|im_end|>"
  pad_token: "<pad>"
  bos_token: "<s>"
  unk_token: "<unk>"
# Additional token list containing the ChatML start marker.
# NOTE(review): presumably these are added to the tokenizer vocabulary — confirm.
tokens:
  - "<|im_start|>"

# Disabled FSDP settings: every line below is commented out, so none of it is active.
# NOTE(review): the wrap class named below is MixtralSparseMoeBlock, while base_model is a
# Mistral-Nemo checkpoint — presumably left over from another config; verify before
# re-enabling.
# fsdp:
#   - full_shard
#   - auto_wrap
# fsdp_config:
#   fsdp_limit_all_gathers: true
#   fsdp_sync_module_states: true
#   fsdp_offload_params: true
#   fsdp_use_orig_params: false
#   fsdp_cpu_ram_efficient_loading: true
#   fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
#   fsdp_state_dict_type: FULL_STATE_DICT
#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
#   fsdp_sharding_strategy: FULL_SHARD
#   fsdp_forward_prefetch: false
#   fsdp_backward_prefetch: BACKWARD_PRE