このnotebookはstockmark/stockmark-13bのモデルをkunishou/databricks-dolly-15k-jaのデータセットを用いてLoRA tuningするためのコードの例です。A100またはH100のGPUを用いることを想定しています。

- モデル：https://huggingface.co/stockmark/stockmark-13b
- データ：https://github.com/kunishou/databricks-dolly-15k-ja

以下の例では、学習を1 epoch行います。A100 GPUで実行すると30分ほどかかります。

また、ここで用いられているハイパーパラメータは最適化されたものではありませんので、必要に応じて調整してください。

# 準備

In [None]:
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, PeftModel, PeftConfig

# Hub id of the base model, and the directory name the trained LoRA adapter
# is saved to / reloaded from.
model_name = "stockmark/stockmark-13b"
peft_model_name = "stockmark-13b-adapter"

# Prompt layout shared by training and inference. The placeholders
# {instruction} and {input} are filled with str.format(); the text ends right
# after the "### Response:" header line so the answer follows directly.
prompt_template = (
    "### Instruction:\n"
    "{instruction}\n"
    "\n"
    "### Input:\n"
    "{input}\n"
    "\n"
    "### Response:\n"
)

def encode(sample):
    """Tokenize one dataset record into ``input_ids`` and loss ``labels``.

    The record's ``instruction`` and ``input`` fields are rendered through the
    module-level ``prompt_template``; its ``output`` field is the target text.
    Both strings are tokenized with the module-level ``tokenizer`` without
    automatic special tokens; BOS is then prepended to the prompt and EOS is
    appended to the target.

    Returns:
        A dict with ``input_ids`` (prompt ids followed by target ids) and
        ``labels`` (the same sequence with every prompt position set to -100).
    """
    prompt_text = prompt_template.format(instruction=sample["instruction"], input=sample["input"])
    encoded = tokenizer([prompt_text, sample["output"]], add_special_tokens=False)
    prompt_ids, target_ids = encoded.input_ids

    prompt_ids = [tokenizer.bos_token_id, *prompt_ids]
    target_ids = [*target_ids, tokenizer.eos_token_id]

    # -100 on the prompt positions keeps them out of the loss calculation,
    # so only the response tokens are learned.
    masked_prompt = [-100] * len(prompt_ids)
    return {"input_ids": prompt_ids + target_ids, "labels": masked_prompt + target_ids}

def get_collator(tokenizer, max_length):
    """Build a collate function that truncates, pads and tensorizes a batch.

    Args:
        tokenizer: Object whose ``pad(list_of_samples)`` pads ``input_ids``
            (and any mask it produces) to a common length while passing
            ``labels`` through unpadded. Its ``padding_side`` attribute
            ("right" when the attribute is absent) decides on which side the
            labels are padded, so labels stay aligned with ``input_ids``.
        max_length: Maximum number of tokens kept per sample; longer
            sequences are cut at the end.

    Returns:
        A function that maps a list of samples (dicts of token-id lists) to a
        dict of ``torch.Tensor`` values.
    """
    def collator(batch):
        # Truncate every field with the same limit so input_ids and labels
        # keep matching lengths.
        batch = [{key: value[:max_length] for key, value in sample.items()} for sample in batch]
        batch = tokenizer.pad(batch)

        # After padding all input_ids rows share one length; bring labels to it.
        padded_length = len(batch["input_ids"][0])
        # Read at call time so a later change of padding_side is honoured.
        pad_left = getattr(tokenizer, "padding_side", "right") == "left"

        padded_labels = []
        for label in batch["labels"]:
            # -100 marks padding positions as ignored by the loss.
            filler = [-100] * (padded_length - len(label))
            padded_labels.append(filler + label if pad_left else label + filler)
        batch["labels"] = padded_labels

        return {key: torch.tensor(value) for key, value in batch.items()}

    return collator

# データセットとモデルのロード

In [None]:
# Tokenizer of the base model; encode() reads this module-level name while
# the dataset is mapped below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize every record, then hold out 10% of the "train" split for validation.
dataset_name = "kunishou/databricks-dolly-15k-ja"
dataset = datasets.load_dataset(dataset_name).map(encode)
dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset, val_dataset = dataset["train"], dataset["test"]

# Base model loaded in bfloat16; device placement is delegated to the library
# via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# LoRA settings: rank 16, alpha 32, dropout 0.05, applied to the modules named
# q_proj / k_proj / v_proj / o_proj.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
    inference_mode=False,
)

# Wrap the base model with the adapter and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# LoRA Tuning

In [None]:
training_args = TrainingArguments(
    output_dir="./log_stockmark_13b",
    # Optimisation: 2 samples per step x 8 accumulation steps = 16 samples
    # per optimizer update on each device.
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=16,
    # Logging every 10 steps.
    logging_strategy="steps",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch, keep at most 2 checkpoints, and
    # reload the checkpoint with the lowest eval_loss when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    # Samples are truncated to 320 tokens before padding.
    data_collator=get_collator(tokenizer, 320),
)

# Run the LoRA fine-tuning.
trainer.train()

# Persist the adapter under peft_model_name, taking the model from the trainer.
model = trainer.model
model.save_pretrained(peft_model_name)

# 学習したモデルのロード（Optional）
異なるセッションでモデルを読み込む場合は、まず最初の「準備」セクションのコードを実行してから、このコードを実行してください。

In [None]:
# Rebuild the tokenizer and the bfloat16 base model, then attach the adapter
# saved under peft_model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    ),
    peft_model_name,
)

# 推論

In [None]:
# Same prompt layout as in training, with an empty "Input" section.
prompt = prompt_template.format(input="", instruction="自然言語処理とは？")

# Tokenize to PyTorch tensors and move them to the model's device.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)

# Sample up to 128 new tokens at temperature 0.7; no gradients are needed.
with torch.no_grad():
    tokens = model.generate(**inputs, do_sample=True, temperature=0.7, max_new_tokens=128)

# Decode the first (only) returned sequence, dropping special tokens.
output = tokenizer.decode(tokens[0], skip_special_tokens=True)
print(output)