# Imports for the LoRA config and trainer used below
# (assumes model, tokenizer, qa_dataset, and collator are already defined earlier).
from peft import LoraConfig, TaskType
from transformers import TrainingArguments
from trl import SFTTrainer

max_seq_length = 500

def fmt(examples):
    print(len(examples))
    return examples

# 'lora_r' is the dimension of the LoRA attention.
lora_r = 32
# 'lora_alpha' is the alpha parameter for LoRA scaling.
lora_alpha = 16
# 'lora_dropout' is the dropout probability for LoRA layers.
lora_dropout = 0.05
# 'target_modules' is a list of the modules that should be targeted by LoRA.
target_modules = ['k_proj', 'q_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj']

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = qa_dataset['train'],
    eval_dataset = qa_dataset['test'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,
    data_collator = collator,
    # formatting_func = fmt,
    # peft_config = peft_config,
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        gradient_checkpointing = True,
        gradient_accumulation_steps = 4,
        per_device_eval_batch_size = 40,
        do_eval = True,
        eval_strategy = 'steps',
        eval_steps = 50,
        # save_strategy = 'steps',
        save_steps = 1000,
        # Use num_train_epochs and warmup_ratio for longer runs!
        # max_steps = 70,
        # warmup_steps = 10,
        # warmup_ratio = 0.1,
        num_train_epochs = 2,
        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 3e-5,
        # embedding_learning_rate = 1e-6,
        # fp16 = not is_bfloat16_supported(),
        bf16 = True,
        logging_steps = 1,
        optim = "adamw_torch",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        # seed = 3407,
        output_dir = "llama_3b_step2_batch_v5",
    ),
)
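
# A minimal follow-up sketch, not part of the original snippet: launch training and
# persist the resulting weights. trainer.train() and trainer.save_model() are standard
# Hugging Face Trainer methods; the "llama_3b_step2_batch_v5_final" directory name is
# an assumed example, not taken from the original code.
train_result = trainer.train()
print(train_result.metrics)  # e.g. train_loss, train_runtime, train_samples_per_second

# Save the fine-tuned model and tokenizer alongside the step checkpoints.
trainer.save_model("llama_3b_step2_batch_v5_final")
tokenizer.save_pretrained("llama_3b_step2_batch_v5_final")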