先程のentryの続きです。
交差検証(Cross Validation)には以下のような種類があるらしいので、 https://www.codexa.net/cross_validation/ の内容を写経しました。
- sklearn.model_selection.train_test_split #ホールドアウト
- sklearn.model_selection.KFold #K分割
- sklearn.model_selection.ShuffleSplit #シャッフル分割
- sklearn.model_selection.StratifiedKFold #層化K分割
- sklearn.model_selection.GroupKFold #グループK分割
- sklearn.model_selection.TimeSeriesSplit #時系列
#!python
# -*- coding: utf-8 -*-
"""Print the train/test splits produced by several scikit-learn splitters.

Each demo function below runs one splitter over the same ten-row toy data set
and prints which rows end up in the training and test portions.
"""
from sklearn.model_selection import train_test_split  # hold-out
from sklearn.model_selection import KFold  # K-fold
from sklearn.model_selection import ShuffleSplit  # shuffle split
from sklearn.model_selection import StratifiedKFold  # stratified K-fold
from sklearn.model_selection import GroupKFold  # group K-fold
from sklearn.model_selection import TimeSeriesSplit  # time series
import numpy as np
import pandas as pd
import sys

# refer to https://www.codexa.net/cross_validation/


def main():
    """Build the toy data set and run every cross-validation demo in turn."""
    df = init_data()  # prepare the initial data
    data = df["data"]
    label = df["label"]
    group = df["group"]

    holdout(df, data, label, group)  # cross validation - hold-out
    k_fold(df, data, label, group)  # same - K-fold
    shuffle_split(df, data, label, group)  # same - shuffle split
    stratified_k_fold(df, data, label, group)  # same - stratified K-fold
    group_k_fold(df, data, label, group)  # same - group K-fold
    time_series_split(df, data, label, group)  # same - time series


def time_series_split(df, data, label, group):
    """Print TimeSeriesSplit folds, first unbounded, then with max_train_size=2.

    Only ``data`` is used; the other parameters exist so that every demo
    function shares the same signature.
    """
    for max_size in [None, 2]:
        # The header is printed once per max_train_size setting.
        print("####", sys._getframe().f_code.co_name)
        tss_a = TimeSeriesSplit(n_splits=3, max_train_size=max_size)
        for train, test in tss_a.split(data):
            # The splitter yields positional indices, hence .iloc.
            print(f"Train Data :{data.iloc[train].values}, Test Data: {data.iloc[test].values}")


def time_series_split_2(df, data, label, group):
    """Print TimeSeriesSplit folds with the training window capped at 2 rows.

    Not called from ``main``; ``time_series_split`` already covers this
    setting in its second pass.
    """
    print("####", sys._getframe().f_code.co_name)
    tss_a = TimeSeriesSplit(n_splits=3, max_train_size=2)
    for train, test in tss_a.split(data):
        print(f"Train Data :{data.iloc[train].values}, Test Data: {data.iloc[test].values}")


def group_k_fold(df, data, label, group):
    """Print GroupKFold folds (2 splits) with data, labels and groups."""
    print("####", sys._getframe().f_code.co_name)
    gkf = GroupKFold(n_splits=2)
    for train, test in gkf.split(df, groups=group):
        print(f"Train Data :{data.iloc[train].values}, Test Data: {data.iloc[test].values}")
        print(f"Train Label:{label.iloc[train].values},Test Label:{label.iloc[test].values}")
        # NOTE: the second field is captioned "Test Label" although it shows
        # the test *groups*. The text is left unchanged so that the program
        # output stays identical to the listing published with this script.
        print(f"Train Group:{group.iloc[train].values},Test Label:{group.iloc[test].values}")


def stratified_k_fold(df, data, label, group):
    """Print StratifiedKFold folds (5 splits, shuffled, seed 0)."""
    print("####", sys._getframe().f_code.co_name)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    # Stratification is done on ``label``.
    for train, test in skf.split(df, label):
        print(f"Train Data :{data.iloc[train].values}, Test Data: {data.iloc[test].values}")
        print(f"Train Label:{label.iloc[train].values},Test Label:{label.iloc[test].values}")


def shuffle_split(df, data, label, group):
    """Print ShuffleSplit folds (5 splits, 70% train / 20% test, seed 0)."""
    print("####", sys._getframe().f_code.co_name)
    ss = ShuffleSplit(n_splits=5, train_size=0.7, test_size=0.2, random_state=0)
    # cross validation
    for train, test in ss.split(df):
        print(f"Train Data :{data.iloc[train].values}, Test Data: {data.iloc[test].values}")
        print(f"Train Label:{label.iloc[train].values},Test Label:{label.iloc[test].values}")


def k_fold(df, data, label, group):
    """Print KFold folds (5 splits, shuffled, seed 1)."""
    print("####", sys._getframe().f_code.co_name)
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    # cross validation
    for train, test in kf.split(df):
        print(f"Train Data :{data.iloc[train].values}, Test Data: {data.iloc[test].values}")
        print(f"Train Label:{label.iloc[train].values},Test Label:{label.iloc[test].values}")


def holdout(df, data, label, group):
    """Print a single 80/20 hold-out split of ``data`` and ``label``."""
    print("####", sys._getframe().f_code.co_name)
    train_x, test_x, train_y, test_y = train_test_split(
        data,
        label,
        train_size=0.8,  # fraction of training data
        test_size=0.2,  # fraction of test data
        shuffle=True,
        random_state=0,  # seed value
    )
    # check the training data
    print(train_x.values)
    print(train_y.values)
    # check the test data
    print(test_x.values)
    print(test_y.values)


def init_data():
    """Return the ten-row toy DataFrame with columns data, label and group."""
    # Named ``columns`` rather than ``all`` to avoid shadowing the builtin.
    columns = dict(
        # strings to be split
        data=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],
        # target variable
        label=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        # group each data item belongs to
        group=[1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
    )
    df = pd.DataFrame(data=columns)
    return df


if __name__ == '__main__':
    main()
↑こう書くと、↓こう表示されます
#### holdout ['4' '9' '1' '6' '7' '3' '0' '5'] [1 0 1 0 0 1 1 0] ['2' '8'] [1 0] #### k_fold Train Data :['0' '1' '3' '4' '5' '6' '7' '8'], Test Data: ['2' '9'] Train Label:[1 1 1 1 0 0 0 0],Test Label:[1 0] Train Data :['0' '1' '2' '3' '5' '7' '8' '9'], Test Data: ['4' '6'] Train Label:[1 1 1 1 0 0 0 0],Test Label:[1 0] Train Data :['1' '2' '4' '5' '6' '7' '8' '9'], Test Data: ['0' '3'] Train Label:[1 1 1 0 0 0 0 0],Test Label:[1 1] Train Data :['0' '2' '3' '4' '5' '6' '8' '9'], Test Data: ['1' '7'] Train Label:[1 1 1 1 0 0 0 0],Test Label:[1 0] Train Data :['0' '1' '2' '3' '4' '6' '7' '9'], Test Data: ['5' '8'] Train Label:[1 1 1 1 1 0 0 0],Test Label:[0 0] #### shuffle_split Train Data :['4' '9' '1' '6' '7' '3' '0'], Test Data: ['2' '8'] Train Label:[1 0 1 0 0 1 1],Test Label:[1 0] Train Data :['1' '2' '9' '8' '0' '6' '7'], Test Data: ['3' '5'] Train Label:[1 1 0 0 1 0 0],Test Label:[1 0] Train Data :['8' '4' '5' '1' '0' '6' '9'], Test Data: ['2' '3'] Train Label:[0 1 0 1 1 0 0],Test Label:[1 1] Train Data :['9' '2' '7' '5' '8' '0' '3'], Test Data: ['6' '1'] Train Label:[0 1 0 0 0 1 1],Test Label:[0 1] Train Data :['7' '4' '1' '0' '6' '8' '9'], Test Data: ['5' '2'] Train Label:[0 1 1 1 0 0 0],Test Label:[0 1] #### stratified_k_fold Train Data :['0' '2' '3' '4' '6' '7' '8' '9'], Test Data: ['1' '5'] Train Label:[1 1 1 1 0 0 0 0],Test Label:[1 0] Train Data :['0' '1' '3' '4' '5' '6' '8' '9'], Test Data: ['2' '7'] Train Label:[1 1 1 1 0 0 0 0],Test Label:[1 0] Train Data :['1' '2' '3' '4' '5' '7' '8' '9'], Test Data: ['0' '6'] Train Label:[1 1 1 1 0 0 0 0],Test Label:[1 0] Train Data :['0' '1' '2' '4' '5' '6' '7' '8'], Test Data: ['3' '9'] Train Label:[1 1 1 1 0 0 0 0],Test Label:[1 0] Train Data :['0' '1' '2' '3' '5' '6' '7' '9'], Test Data: ['4' '8'] Train Label:[1 1 1 1 0 0 0 0],Test Label:[1 0] #### group_k_fold Train Data :['1' '3' '5' '7' '9'], Test Data: ['0' '2' '4' '6' '8'] Train Label:[1 1 0 0 0],Test Label:[1 1 1 0 0] Train Group:[0 0 0 0 0],Test 
Label:[1 1 1 1 1] Train Data :['0' '2' '4' '6' '8'], Test Data: ['1' '3' '5' '7' '9'] Train Label:[1 1 1 0 0],Test Label:[1 1 0 0 0] Train Group:[1 1 1 1 1],Test Label:[0 0 0 0 0] #### time_series_split Train Data :['0' '1' '2' '3'], Test Data: ['4' '5'] Train Data :['0' '1' '2' '3' '4' '5'], Test Data: ['6' '7'] Train Data :['0' '1' '2' '3' '4' '5' '6' '7'], Test Data: ['8' '9'] #### time_series_split Train Data :['2' '3'], Test Data: ['4' '5'] Train Data :['4' '5'], Test Data: ['6' '7'] Train Data :['6' '7'], Test Data: ['8' '9']