end0tknr's kipple - web写経開発


PEFT (Parameter-Efficient Fine-Tuning, LoRA) in python: fine-tuning the LLM rinna/japanese-gpt-neox-3.6b-instruction-ppo

Using the environment set up in two earlier entries, I transcribed (hand-copied) the script from the following URL:

note.com


Installing additional python modules

!pip install -Uqq git+https://github.com/huggingface/peft.git
!pip install -Uqq transformers datasets accelerate
!pip install sentencepiece
!pip install scipy
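
After the installs, a quick way to confirm that everything imports cleanly is to print each package's version (the exact version numbers will of course depend on when the install is run):

# sanity check: import the freshly installed packages and show their versions
import peft, transformers, datasets, accelerate, sentencepiece, scipy
for mod in (peft, transformers, datasets, accelerate, sentencepiece, scipy):
    print(mod.__name__.ljust(13), mod.__version__)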

PEFT (LoRA) python script

# -*- coding: utf-8 -*-
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_int8_training, TaskType
import json
import transformers

model_name = "rinna/japanese-gpt-neox-3.6b-instruction-ppo"
peft_name  = "lorappo-rinna-3.6b"         # output dir for the trained LoRA (PEFT) adapter
output_dir = "lorappo-rinna-3.6b-results" # output dir for training checkpoints/results
dataset_src_path = "enquete.txt"

lora_config = LoraConfig(r             =8,    # rank of the LoRA update matrices
                         lora_alpha    =16,   # scaling factor for the LoRA updates
                         target_modules=["query_key_value"],  # GPT-NeoX fused QKV projection
                         lora_dropout  =0.05,
                         bias          ="none",
                         task_type     =TaskType.CAUSAL_LM )

def main():
    tokenizer = load_tokenizer()
    train_dataset, val_dataset = load_datasets(dataset_src_path,tokenizer)
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 load_in_8bit=True,
                                                 device_map="auto" )
    # prepare the model for int8 (8-bit) training
    model = prepare_model_for_int8_training(model)
    # wrap the model with LoRA adapters
    model = get_peft_model(model, lora_config)
    # show how many parameters are actually trainable
    model.print_trainable_parameters()

    trainer = prepare_trainer(model,train_dataset,val_dataset,tokenizer)
    # train (disable the KV cache during training, re-enable it afterwards)
    model.config.use_cache = False
    trainer.train()
    model.config.use_cache = True
    
    # save the LoRA adapter
    trainer.model.save_pretrained(peft_name)

def prepare_trainer(model,train_dataset,val_dataset,tokenizer):
    trainer = transformers.Trainer(
        model        =model,
        train_dataset=train_dataset,
        eval_dataset =val_dataset,
        args=transformers.TrainingArguments(
            num_train_epochs    =3,
            learning_rate       =3e-4,
            logging_steps       =20,
            evaluation_strategy ="steps",
            save_strategy       ="steps",
            eval_steps          =200,
            save_steps          =200,
            output_dir          =output_dir,
            save_total_limit    =3,
            push_to_hub         =False,
            auto_find_batch_size=True   # shrink the batch size automatically on OOM
        ),
        # mlm=False => batches for causal LM training (no masked-LM masking)
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer,
                                                                   mlm=False),
    )
    return trainer
    
def load_datasets(dataset_src_path,tokenizer):
    train_dataset = []
    val_dataset   = []

    # enquete.txt is expected to hold one "<instruction>\t<answer>" pair per line
    with open(dataset_src_path, mode='r',encoding="UTF-8") as f:
        i = 0
        for tsv_line in f.readlines():
            i += 1
            dataset_src = tsv_line.strip().split("\t")
            prompt = generate_prompt( dataset_src )
            token  = tokenize(prompt, tokenizer)
            # every 5th line goes to validation, the rest to training
            if i % 5 == 0:
                val_dataset.append(token)
            else:
                train_dataset.append(token)
    return train_dataset, val_dataset

def load_tokenizer():
    # Even after reading the docs, why "use_fast=False" is needed was not clear to me,
    # but it seems to be required here.
    # https://huggingface.co/docs/transformers/v4.30.0/en/model_doc/auto
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              use_fast=False)
    # the print()s below are for debugging
    # print(tokenizer.special_tokens_map)
    # print("bos_token :", tokenizer.bos_token, ",", tokenizer.bos_token_id)
    # print("eos_token :", tokenizer.eos_token, ",", tokenizer.eos_token_id)
    # print("unk_token :", tokenizer.unk_token, ",", tokenizer.unk_token_id)
    # print("pad_token :", tokenizer.pad_token, ",", tokenizer.pad_token_id)
    
    return tokenizer
    
def tokenize(prompt, tokenizer):
    # If max_length=256 is not specified, the model's maximum length is apparently used.
    # cf. https://note.com/npaka/n/n36acd2122192
    result = tokenizer(prompt,
                       truncation=True,
                       max_length=256,
                       padding   =False )
    
    return {"input_ids"     : result["input_ids"],
            "attention_mask": result["attention_mask"] }

def generate_prompt( dataset_src ):
    prompt = f"""### 指示:
{dataset_src[0]}

### 回答:
{dataset_src[1]}
"""
    # the rinna instruction-tuned models expect newlines to be encoded as "<NL>"
    prompt = prompt.replace('\n', '<NL>')
    return prompt

if __name__ == '__main__':
    main()
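
For reference, this is roughly what one line of enquete.txt turns into after generate_prompt() and the <NL> replacement; the question/answer text here is made up purely for illustration:

# hypothetical one-line sample of enquete.txt (tab-separated instruction and answer)
sample_line = "今回のセミナーの感想を教えてください\tとても分かりやすかったです"
dataset_src = sample_line.strip().split("\t")

prompt = f"""### 指示:
{dataset_src[0]}

### 回答:
{dataset_src[1]}
"""
print(prompt.replace('\n', '<NL>'))
# => ### 指示:<NL>今回のセミナーの感想を教えてください<NL><NL>### 回答:<NL>とても分かりやすかったです<NL>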

Running the PEFT (LoRA) python script

Below is an excerpt of the console output from running PEFT (LoRA) training with the script above on enquete.txt (82 records, 26 MB).

I will try inference with the training results in a later entry (a rough sketch of loading the saved adapter is shown after the log excerpt below). Whether because the GeForce 3060 GPU I used was overtaxed, or because my training conditions were poor to begin with, training took just under four days to complete.

{'loss': 0.9846, 'learning_rate': 2.2502291475710358e-05, 'epoch': 2.77}
{'loss': 0.9305, 'learning_rate': 2.227314390467461e-05, 'epoch': 2.78}
{'loss': 0.9045, 'learning_rate': 2.2043996333638858e-05, 'epoch': 2.78}
{'loss': 0.9285, 'learning_rate': 2.1814848762603113e-05, 'epoch': 2.78}
{'loss': 0.9426, 'learning_rate': 2.1585701191567365e-05, 'epoch': 2.78}
{'loss': 0.9998, 'learning_rate': 2.135655362053162e-05, 'epoch': 2.79}
{'loss': 0.927, 'learning_rate': 2.1127406049495872e-05, 'epoch': 2.79}
{'loss': 0.8817, 'learning_rate': 2.0898258478460124e-05, 'epoch': 2.79}
{'loss': 0.8973, 'learning_rate': 2.066911090742438e-05, 'epoch': 2.79}
{'loss': 0.9317, 'learning_rate': 2.043996333638863e-05, 'epoch': 2.8}
{'eval_loss': 1.1007884740829468, 'eval_runtime': 1591.4846, 'eval_samples_per_second': 10.968, 'eval_steps_per_second': 1.371, 'epoch': 2.8}
 93%|█████████████████████████████████████████████████▍   | 24400/26184 [84:49:47<1:33:45,  3.15s/it]C:\Users\end0t\miniconda3\envs\mycuda\lib\site-packages\bitsandbytes\autograd\_functions.py:321: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
<snip>
{'eval_loss': 1.098840355873108, 'eval_runtime': 2009.9981, 'eval_samples_per_second': 8.684, 'eval_steps_per_second': 1.086, 'epoch': 2.98}
 99%|██████████████████████████████████████████████████████▌| 26000/26184 [89:53:34<10:33,  3.45s/it]C:\Users\end0t\miniconda3\envs\mycuda\lib\site-packages\bitsandbytes\autograd\_functions.py:321: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
{'loss': 1.0022, 'learning_rate': 1.8790100824931255e-06, 'epoch': 2.98}
{'loss': 0.9512, 'learning_rate': 1.6498625114573782e-06, 'epoch': 2.98}
{'loss': 0.986, 'learning_rate': 1.4207149404216316e-06, 'epoch': 2.99}
{'loss': 0.923, 'learning_rate': 1.1915673693858843e-06, 'epoch': 2.99}
{'loss': 0.9338, 'learning_rate': 9.624197983501375e-07, 'epoch': 2.99}
{'loss': 0.9578, 'learning_rate': 7.332722273143903e-07, 'epoch': 2.99}
{'loss': 0.9031, 'learning_rate': 5.041246562786434e-07, 'epoch': 2.99}
{'loss': 0.9126, 'learning_rate': 2.749770852428964e-07, 'epoch': 3.0}
{'loss': 0.9763, 'learning_rate': 4.5829514207149396e-08, 'epoch': 3.0}
{'train_runtime': 324200.6491, 'train_samples_per_second': 0.646, 'train_steps_per_second': 0.081, 'train_loss': 1.0020638379324405, 'epoch': 3.0}
100%|███████████████████████████████████████████████████████| 26184/26184 [90:03:20<00:00, 12.38s/it]
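
Once training finishes, the adapter saved in lorappo-rinna-3.6b can be attached back onto the 8-bit base model for inference. A minimal sketch follows; the prompt reuses the same "### 指示:" / "### 回答:" / <NL> format used during training, while the instruction text and the generation parameters (max_new_tokens, temperature) are only placeholder assumptions, not tuned values:

# minimal inference sketch: load the base model in 8bit and attach the LoRA adapter
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

model_name = "rinna/japanese-gpt-neox-3.6b-instruction-ppo"
peft_name  = "lorappo-rinna-3.6b"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             load_in_8bit=True,
                                             device_map="auto")
model = PeftModel.from_pretrained(model, peft_name)
model.eval()

# build a prompt in the same format as the training data (placeholder instruction)
prompt = "### 指示:\n今回のセミナーの感想を教えてください\n\n### 回答:\n".replace('\n', '<NL>')
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs,
                            max_new_tokens=128,
                            do_sample=True,
                            temperature=0.7,
                            pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(output[0], skip_special_tokens=True).replace('<NL>', '\n'))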