- install tensorflow_gpu & pytorch for python 3.10 to windows11 & NVIDIA GeForce RTX3060(GDDR6 12GB) - end0tknr's kipple - web写経開発
- install bitsandbytes for python to windows11 - end0tknr's kipple - web写経開発
Using the environment built in the two entries above, I worked through (hand-copied) the code from the following url.
Installing additional python modules
!pip install -Uqq git+https://github.com/huggingface/peft.git
!pip install -Uqq transformers datasets accelerate
!pip install sentencepiece
!pip install scipy
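Before moving on to the training script, it may be worth confirming that the GPU and the freshly installed modules are actually visible from python. This is only a minimal sanity-check sketch I would run at this point; it is not part of the original steps:

# -*- coding: utf-8 -*-
# sanity check: confirm CUDA and the freshly installed modules load correctly
import torch
import transformers
import peft
import bitsandbytes   # importing is enough to confirm the windows build loads

print("torch        :", torch.__version__)
print("transformers :", transformers.__version__)
print("peft         :", peft.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("gpu          :", torch.cuda.get_device_name(0))  # expect GeForce RTX 3060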
PEFT (LoRA) python script
# -*- coding: utf-8 -*-
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_int8_training, TaskType
import json
import transformers

model_name = "rinna/japanese-gpt-neox-3.6b-instruction-ppo"
peft_name  = "lorappo-rinna-3.6b"          # PEFT output dir
output_dir = "lorappo-rinna-3.6b-results"  # training output dir
dataset_src_path = "enquete.txt"

lora_config = LoraConfig(r=8,
                         lora_alpha=16,
                         target_modules=["query_key_value"],
                         lora_dropout=0.05,
                         bias="none",
                         task_type=TaskType.CAUSAL_LM)

def main():
    tokenizer = load_tokenizer()
    train_dataset, val_dataset = load_datasets(dataset_src_path, tokenizer)

    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 load_in_8bit=True,
                                                 device_map="auto")
    # preprocess the model for int8 training
    model = prepare_model_for_int8_training(model)
    # attach the LoRA adapters
    model = get_peft_model(model, lora_config)
    # show the trainable parameters
    model.print_trainable_parameters()

    trainer = prepare_trainer(model, train_dataset, val_dataset, tokenizer)

    # train
    model.config.use_cache = False
    trainer.train()
    model.config.use_cache = True

    # save the LoRA model
    trainer.model.save_pretrained(peft_name)

def prepare_trainer(model, train_dataset, val_dataset, tokenizer):
    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        args=transformers.TrainingArguments(
            num_train_epochs=3,
            learning_rate=3e-4,
            logging_steps=20,
            evaluation_strategy="steps",
            save_strategy="steps",
            eval_steps=200,
            save_steps=200,
            output_dir=output_dir,
            save_total_limit=3,
            push_to_hub=False,
            auto_find_batch_size=True
        ),
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    return trainer

def load_datasets(dataset_src_path, tokenizer):
    train_dataset = []
    val_dataset = []

    with open(dataset_src_path, mode='r', encoding="UTF-8") as f:
        i = 0
        for tsv_line in f.readlines():
            i += 1
            dataset_src = tsv_line.strip().split()
            prompt = generate_prompt(dataset_src)
            token = tokenize(prompt, tokenizer)

            # every 5th record goes to the validation set
            if i % 5 == 0:
                val_dataset.append(token)
            else:
                train_dataset.append(token)

    return train_dataset, val_dataset

def load_tokenizer():
    # the docs don't make clear why, but "use_fast=False" seems to be required
    # https://huggingface.co/docs/transformers/v4.30.0/en/model_doc/auto
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # the following print()s are for debugging
    # print(tokenizer.special_tokens_map)
    # print("bos_token :", tokenizer.bos_token, ",", tokenizer.bos_token_id)
    # print("eos_token :", tokenizer.eos_token, ",", tokenizer.eos_token_id)
    # print("unk_token :", tokenizer.unk_token, ",", tokenizer.unk_token_id)
    # print("pad_token :", tokenizer.pad_token, ",", tokenizer.pad_token_id)
    return tokenizer

def tokenize(prompt, tokenizer):
    # without "max_length=256", the model's maximum length is apparently used
    # cf https://note.com/npaka/n/n36acd2122192
    result = tokenizer(prompt,
                       truncation=True,
                       max_length=256,
                       padding=False)
    return {"input_ids":      result["input_ids"],
            "attention_mask": result["attention_mask"]}

def generate_prompt(dataset_src):
    prompt = f"""### 指示:
{dataset_src[0]}

### 回答:
{dataset_src[1]}
"""
    prompt = prompt.replace('\n', '<NL>')
    return prompt

if __name__ == '__main__':
    main()
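For reference, load_datasets() above reads enquete.txt one record per line and splits each line on whitespace with tsv_line.strip().split(), so the instruction and the answer must not themselves contain spaces (otherwise a tab-aware split('\t') would be needed); every 5th record is routed to the validation set. Below is a minimal sketch with a made-up sample line, only to illustrate the expected shape of the data (the sample text is hypothetical, not from the actual enquete.txt):

# -*- coding: utf-8 -*-
# hypothetical sample line in the shape load_datasets() expects:
# one record per line, instruction and answer separated by whitespace (e.g. a tab)
sample_line = "好きな色は何ですか\t青が好きです\n"

dataset_src = sample_line.strip().split()
print(dataset_src)        # ['好きな色は何ですか', '青が好きです']
print(dataset_src[0])     # becomes the "### 指示:" part of the prompt
print(dataset_src[1])     # becomes the "### 回答:" part of the prompt

# in load_datasets(), every 5th record (i % 5 == 0) goes to val_dataset,
# the rest to train_dataset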
Running the PEFT (LoRA) python script
Below is an excerpt of the console output from running PEFT (LoRA) training with the above python script on an enquete.txt of 82 records (26MB).
Inference with the training result will be covered in a separate entry, but whether the GeForce RTX 3060 GPU was simply overburdened or the training conditions themselves were poor, it took just under 4 days for training to complete.
{'loss': 0.9846, 'learning_rate': 2.2502291475710358e-05, 'epoch': 2.77}
{'loss': 0.9305, 'learning_rate': 2.227314390467461e-05, 'epoch': 2.78}
{'loss': 0.9045, 'learning_rate': 2.2043996333638858e-05, 'epoch': 2.78}
{'loss': 0.9285, 'learning_rate': 2.1814848762603113e-05, 'epoch': 2.78}
{'loss': 0.9426, 'learning_rate': 2.1585701191567365e-05, 'epoch': 2.78}
{'loss': 0.9998, 'learning_rate': 2.135655362053162e-05, 'epoch': 2.79}
{'loss': 0.927, 'learning_rate': 2.1127406049495872e-05, 'epoch': 2.79}
{'loss': 0.8817, 'learning_rate': 2.0898258478460124e-05, 'epoch': 2.79}
{'loss': 0.8973, 'learning_rate': 2.066911090742438e-05, 'epoch': 2.79}
{'loss': 0.9317, 'learning_rate': 2.043996333638863e-05, 'epoch': 2.8}
{'eval_loss': 1.1007884740829468, 'eval_runtime': 1591.4846, 'eval_samples_per_second': 10.968, 'eval_steps_per_second': 1.371, 'epoch': 2.8}
 93%|█████████████████████████████████████████████████▍ | 24400/26184 [84:49:47<1:33:45, 3.15s/it]C:\Users\end0t\miniconda3\envs\mycuda\lib\site-packages\bitsandbytes\autograd\_functions.py:321: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
<snip>
{'eval_loss': 1.098840355873108, 'eval_runtime': 2009.9981, 'eval_samples_per_second': 8.684, 'eval_steps_per_second': 1.086, 'epoch': 2.98}
 99%|██████████████████████████████████████████████████████▌| 26000/26184 [89:53:34<10:33, 3.45s/it]C:\Users\end0t\miniconda3\envs\mycuda\lib\site-packages\bitsandbytes\autograd\_functions.py:321: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
{'loss': 1.0022, 'learning_rate': 1.8790100824931255e-06, 'epoch': 2.98}
{'loss': 0.9512, 'learning_rate': 1.6498625114573782e-06, 'epoch': 2.98}
{'loss': 0.986, 'learning_rate': 1.4207149404216316e-06, 'epoch': 2.99}
{'loss': 0.923, 'learning_rate': 1.1915673693858843e-06, 'epoch': 2.99}
{'loss': 0.9338, 'learning_rate': 9.624197983501375e-07, 'epoch': 2.99}
{'loss': 0.9578, 'learning_rate': 7.332722273143903e-07, 'epoch': 2.99}
{'loss': 0.9031, 'learning_rate': 5.041246562786434e-07, 'epoch': 2.99}
{'loss': 0.9126, 'learning_rate': 2.749770852428964e-07, 'epoch': 3.0}
{'loss': 0.9763, 'learning_rate': 4.5829514207149396e-08, 'epoch': 3.0}
{'train_runtime': 324200.6491, 'train_samples_per_second': 0.646, 'train_steps_per_second': 0.081, 'train_loss': 1.0020638379324405, 'epoch': 3.0}
100%|███████████████████████████████████████████████████████| 26184/26184 [90:03:20<00:00, 12.38s/it]
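As noted above, inference with the trained adapter is left for a separate entry, but since trainer.model.save_pretrained(peft_name) writes the LoRA weights into the lorappo-rinna-3.6b directory, reloading them later should look roughly like the following sketch (untested here; shown only as an assumption of how the saved adapter would be reused):

# -*- coding: utf-8 -*-
# minimal sketch: reload the base model in 8bit and attach the saved LoRA adapter
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

model_name = "rinna/japanese-gpt-neox-3.6b-instruction-ppo"
peft_name  = "lorappo-rinna-3.6b"   # directory written by trainer.model.save_pretrained()

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             load_in_8bit=True,
                                             device_map="auto")
model = PeftModel.from_pretrained(model, peft_name)
model.eval()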