|
---
license: apache-2.0
base_model: arcee-ai/SuperNova-Medius
language:
- en
library_name: transformers
pipeline_tag: text-generation
datasets:
- nothingiisreal/Claude-3-Opus-Instruct-15K
- nothingiisreal/Short-Storygen-v2
- allura-org/Celeste-1.x-data-mixture
- allura-org/niir-C2logs-cleaned
- allura-org/gryphe-sonnet-3.5-charcards-names-added
---
|
|
|
<img src="neon.png"> |
|
<small>Image by CalamitousFelicitousness</small> |
|
|
|
--- |
|
|
|
# Qwen2.5-14B Neon v1 |
|
|
|
RP finetune of SuperNova-Medius. Turned out surprisingly nice on its own; I honestly made it only as merge fuel, but it impressed me and Prodeus enough to release it separately (history repeats itself, I guess; Sugarquill also started out this way). Quite interesting prose, definitely quite distinct from SuperNova, or EVA for that matter.

Instruction following is decent as well. Not much else to say about this one; just a solid RP model, tbh. Euryale-inspired, I guess.
|
|
|
Model was trained by Auri. |
|
|
|
--- |
|
|
|
**Training notes** |
|
|
|
The model was trained on a dataset of 77M tokens of synthetic RP and short story generation data. Training took around 2 hours on an 8xH100 SXM node. The training config was more or less reused from Sugarquill, and it worked fairly well again.
|
The node crashed after training finished, before the LoRA could be merged in, so I had to merge it with MergeKit on a separate node; otherwise everything went smoothly.
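
The end result of that step is simply the adapter folded back into the base weights. Purely as an illustration (the actual merge was done with MergeKit), the equivalent operation sketched with `peft` looks roughly like this; the output directory name is made up:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base model and attach the trained adapter (hub id from the config below).
base = AutoModelForCausalLM.from_pretrained(
    "arcee-ai/SuperNova-Medius", torch_dtype=torch.bfloat16
)
lora = PeftModel.from_pretrained(base, "allura-org/TQ-2.5-14B-Neon-LoRA")

# Fold the adapter weights into the base model, drop the PEFT wrappers,
# and save a standalone checkpoint (output path is hypothetical).
merged = lora.merge_and_unload()
merged.save_pretrained("TQ-2.5-14B-Neon-merged")
AutoTokenizer.from_pretrained("arcee-ai/SuperNova-Medius").save_pretrained(
    "TQ-2.5-14B-Neon-merged"
)
```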
|
|
|
Huge thanks to Retis Labs for sponsoring this run! |
|
|
|
**Format** |
|
|
|
The model responds to ChatML instruct formatting, exactly like its base model.
|
|
|
```
<|im_start|>system
{system message}<|im_end|>
<|im_start|>user
{user message}<|im_end|>
<|im_start|>assistant
{response}<|im_end|>
```
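
If you run the model through `transformers`, the tokenizer's built-in chat template should produce this format for you. A minimal sketch, assuming a hypothetical repo id (substitute wherever the merged model actually lives):

```python
from transformers import AutoTokenizer

# Hypothetical repo id for illustration; point this at the actual model.
model_id = "allura-org/TQ-2.5-14B-Neon"

tokenizer = AutoTokenizer.from_pretrained(model_id)

messages = [
    {"role": "system", "content": "You are Neon, a roleplay partner."},
    {"role": "user", "content": "The rain hasn't stopped for three days."},
]

# apply_chat_template renders the ChatML turns shown above;
# add_generation_prompt appends the opening <|im_start|>assistant tag.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```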
|
|
|
**Recommended Samplers** |
|
|
|
My classic stable Qwen setup works quite well: |
|
|
|
```
Temperature - 0.8
Min-P - 0.05
Top-A - 0.3
Repetition Penalty - 1.03
```
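
Temperature, Min-P, and Repetition Penalty map directly onto `generate()` arguments in `transformers` (Min-P needs a reasonably recent release); Top-A is exposed by backends like KoboldCpp or text-generation-webui rather than by `transformers` itself, so it is left out of this sketch. The repo id is again a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical repo id for illustration; point this at the actual model.
model_id = "allura-org/TQ-2.5-14B-Neon"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{"role": "user", "content": "Write the opening scene of a neon-lit heist."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# Temperature, Min-P and Repetition Penalty from the list above.
# Top-A is not a transformers sampler, so it is omitted here.
output = model.generate(
    input_ids,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.8,
    min_p=0.05,
    repetition_penalty=1.03,
)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```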
|
|
|
**Training config** |
|
<details><summary>See Axolotl config</summary> |
|
|
|
axolotl version `0.6.0` |
|
|
|
```yaml
# Model
base_model: arcee-ai/SuperNova-Medius
strict: false

# Liger Kernels (optimization)
plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true

# Output and HuggingFace
output_dir: /workspace/axolotl/TQ-2.5-14B-Neon
hub_model_id: allura-org/TQ-2.5-14B-Neon-LoRA
hf_use_auth_token: true
hub_strategy: "all_checkpoints"

# WandB
wandb_project: allura-org
wandb_entity:
wandb_name: TQ-2.5-14B-Neon-1

# Data
chat_template: chatml
#train_on_inputs: false
group_by_length: false
datasets:
  - path: allura-org/neon-41k
    type: chat_template
    field_messages: conversations
    message_field_role: from
    message_field_content: value

## Evaluation
val_set_size: 0.01
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128

# Technical aspects
sequence_len: 16384
save_safetensors: true
saves_per_epoch: 2
logging_steps: 1
special_tokens:

# Quantization
bf16: auto
fp16:
tf32: false
## For LoRA
load_in_8bit: false
load_in_4bit: false

# LoRA
peft_use_rslora: true
peft_use_dora: false # better but slower
adapter: lora # lora or qlora
lora_model_dir:
lora_r: 64 # 64 is optimal for most trains on instruct
lora_alpha: 32
lora_dropout: 0.1
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
# - embed_tokens
# - lm_head

#loraplus_lr_ratio: 8 # works to converge faster but is kinda cancer bc makes model unstable
#loraplus_lr_embedding:

# Training hyperparameters
# max_steps:
num_epochs: 2

# Anti Overfit and Stability
weight_decay: 0.01
max_grad_norm: 1.0

## Learning Rate
warmup_ratio: 0.05
learning_rate: 0.00003
lr_scheduler: cosine
#lr_scheduler_kwargs:
# min_lr: 0.0000024
optimizer: paged_ademamix_8bit # usually adamw_torch or paged_adamw_8bit

## Batch Size
gradient_accumulation_steps: 4 # More effective batch size - stabler train, usually. MBS also speeds it up.
micro_batch_size: 4 # Batch size per gpu = micro_batch_size * gradient_accumulation_steps
eval_batch_size: 1

# Optimizations
pad_to_sequence_len: true
sample_packing: true
eval_sample_packing: false
flash_attention: true
xformers_attention:
gradient_checkpointing: "unsloth"
gradient_checkpointing_kwargs:
  use_reentrant: true
local_rank:
deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16.json # Only use with multi gpu # _bf16_cpuoffload_all
# fsdp:
# - full_shard
# - auto_wrap
# fsdp_config:
# fsdp_limit_all_gathers: true
# fsdp_sync_module_states: true
# fsdp_offload_params: true
# fsdp_use_orig_params: false
# fsdp_cpu_ram_efficient_loading: true
# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
# fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
# fsdp_state_dict_type: FULL_STATE_DICT
# fsdp_sharding_strategy: FULL_SHARD
# Misc
early_stopping_patience:
debug:
```
|
|
|
</details> |