end0tknr's kipple - web写経開発


Hyperparameter optimization

deep-learning-from-scratch/ch06 at master · oreilly-japan/deep-learning-from-scratch · GitHub

This entry is my transcription (写経) of pp. 197-202 of 「ゼロから作るDeep Learning ①」 (Deep Learning from Scratch: the theory and implementation of deep learning, learned with Python).

What are hyperparameters?

  • the number of neurons in each layer
  • the batch size
  • the learning rate used when updating the parameters
  • Weight decay

and so on; they are separate from the weights and biases themselves.

Validation data (≠ test data)

If you tune hyperparameters against the test data, the hyperparameter values end up overfitting to the test data.

Hyperparameter tuning therefore apparently needs validation data set aside specifically for that purpose.
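Before the full script, here is a minimal sketch of those two points (my own condensed version of the code below; x_train and t_train are assumed to be already-loaded MNIST arrays):

import numpy as np

# split off validation data (the test data is never used for tuning)
validation_rate = 0.20
validation_num  = int(len(x_train) * validation_rate)
perm = np.random.permutation(len(x_train))        # shuffle before splitting
x_train, t_train = x_train[perm], t_train[perm]
x_val,   t_val   = x_train[:validation_num], t_train[:validation_num]
x_train, t_train = x_train[validation_num:], t_train[validation_num:]

# random search: sample each hyperparameter on a log scale
weight_decay = 10 ** np.random.uniform(-8, -4)
lr           = 10 ** np.random.uniform(-6, -2)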

Python implementation of hyperparameter optimization

# coding: utf-8
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import gzip
from collections import OrderedDict


def main():
    mymnist = MyMnist()
    (x_train, t_train, x_test, t_test) = mymnist.load_mnist()

    # 高速化の為、訓練data削減
    x_train = x_train[:500]
    t_train = t_train[:500]

    # 検証data分離
    validation_rate = 0.20
    validation_num = int(x_train.shape[0] * validation_rate)
    x_train, t_train = shuffle_dataset(x_train, t_train)
    x_val   = x_train[:validation_num] # 検証data
    t_val   = t_train[:validation_num] # 〃
    x_train = x_train[validation_num:]
    t_train = t_train[validation_num:]

    # ハイパーパラメータのランダム探索
    optimization_trial = 100
    results_val = {}
    results_train = {}
    for _ in range(optimization_trial):
        # 探索したハイパーパラメータの範囲を指定
        weight_decay = 10 ** np.random.uniform(-8, -4)
        lr = 10 ** np.random.uniform(-6, -2)    # 学習係数

        val_acc_list, train_acc_list = __train(lr,
                                               weight_decay,
                                               x_train,
                                               t_train,
                                               x_val,
                                               t_val )
        print("val acc:", str(val_acc_list[-1]),
              " | lr:" + str(lr),
              "weight decay:", str(weight_decay) )
        key = "lr:" + str(lr) + ", weight decay:" + str(weight_decay)
        results_val[key] = val_acc_list
        results_train[key] = train_acc_list

    # グラフの描画
    graph_draw_num = 20
    col_num = 5
    row_num = int(np.ceil(graph_draw_num / col_num))
    i = 0

    for key, val_acc_list in sorted(results_val.items(),
                                    key=lambda x:x[1][-1],
                                    reverse=True):
        print("Best-" + str(i+1),
              "(val acc:" + str(val_acc_list[-1]) + ") | " + key)

        plt.subplot(row_num, col_num, i+1)
        plt.title("Best-" + str(i+1))
        plt.ylim(0.0, 1.0)
        if i % 5: plt.yticks([])
        plt.xticks([])
        x = np.arange(len(val_acc_list))
        plt.plot(x, val_acc_list)
        plt.plot(x, results_train[key], "--")
        i += 1

        if i >= graph_draw_num:
            break

    plt.show()

def __train(lr,
            weight_decay,
            x_train,
            t_train,
            x_val,
            t_val,
            epochs=50):
    network = MultiLayerNet(input_size=784,
                            hidden_size_list=[100, 100, 100, 100, 100, 100],
                            output_size=10,
                            weight_decay_lambda=weight_decay)
    
    trainer = Trainer(network,
                      x_train,
                      t_train,
                      x_val,
                      t_val,
                      epochs=epochs,
                      mini_batch_size=100,
                      optimizer='sgd',
                      optimizer_param={'lr': lr},
                      verbose=False)
    trainer.train()
    return trainer.test_acc_list, trainer.train_acc_list

class MyMnist:
    def __init__(self):
        pass

    def load_mnist(self):
        data_files = self.download_mnist()
        # convert numpy
        dataset = {}
        dataset['train_img']   = self.load_img(  data_files['train_img'] )
        dataset['train_label'] = self.load_label(data_files['train_label'])
        dataset['test_img']    = self.load_img(  data_files['test_img']  )
        dataset['test_label']  = self.load_label(data_files['test_label'])

        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0

        for key in ('train_label','test_label'):
            dataset[key]=self.change_one_hot_label( dataset[key] )

        return (dataset['train_img'],
                dataset['train_label'],
                dataset['test_img'],
                dataset['test_label'] )

    def change_one_hot_label(self,X):
        T = np.zeros((X.size, 10))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        return T
    
    def download_mnist(self):
        url_base = 'http://yann.lecun.com/exdb/mnist/'
        key_file = {'train_img'  :'train-images-idx3-ubyte.gz',
                    'train_label':'train-labels-idx1-ubyte.gz',
                    'test_img'   :'t10k-images-idx3-ubyte.gz',
                    'test_label' :'t10k-labels-idx1-ubyte.gz' }
        data_files = {}
        dataset_dir = os.path.dirname(os.path.abspath(__file__))
        
        for data_name, file_name in key_file.items():
            req_url   = url_base+file_name
            file_path = dataset_dir + "/" + file_name

            request  = urllib.request.Request( req_url )
            response = urllib.request.urlopen(request).read()
            with open(file_path, mode='wb') as f:
                f.write(response)
                
            data_files[data_name] = file_path
        return data_files

    def load_img( self,file_path):
        img_size    = 784 # = 28*28
        
        with gzip.open(file_path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        data = data.reshape(-1, img_size)
        return data
    
    def load_label(self,file_path):
        with gzip.open(file_path, 'rb') as f:
            labels = np.frombuffer(f.read(), np.uint8, offset=8)
        return labels
    
# x:訓練データ、t:教師データ
def shuffle_dataset(x, t):
    permutation = np.random.permutation(x.shape[0])
    x = x[permutation,:] if x.ndim == 2 else x[permutation,:,:,:]
    t = t[permutation]

    return x, t


# 全結合による多層ニューラルネットワーク
class MultiLayerNet:
    """
    input_size : 
    hidden_size_list : 隠れ層のニューロンの数のリスト(e.g. [100, 100, 100])
    output_size : 
    activation : 'relu' or 'sigmoid'
    weight_init_std : 重みの標準偏差を指定(e.g. 0.01)
        'relu'または'he'を指定した場合は「Heの初期値」を設定
        'sigmoid'または'xavier'を指定した場合は「Xavierの初期値」を設定
    weight_decay_lambda : Weight Decay(L2ノルム)の強さ
    """
    def __init__(self,
                 input_size,      # 入力size(MNISTの場合 784)
                 hidden_size_list,# 隠れ層neuron数list 例[100,100,100]
                 output_size,     # 出力size(MNISTの場合は10)
                 activation='relu', # 活性化関数 'relu' or 'sigmoid'
                 weight_init_std='relu',
                 weight_decay_lambda=0):
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size_list = hidden_size_list
        self.hidden_layer_num = len(hidden_size_list)
        self.weight_decay_lambda = weight_decay_lambda
        self.params = {}

        # 重みの初期化
        self.__init_weight(weight_init_std)

        # レイヤの生成
        activation_layer = {'sigmoid': Sigmoid, 'relu': Relu}
        self.layers = OrderedDict()
        for idx in range(1, self.hidden_layer_num+1):
            self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
                                                      self.params['b' + str(idx)])
            self.layers['Activation_function' + str(idx)] = activation_layer[activation]()

        idx = self.hidden_layer_num + 1
        self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
            self.params['b' + str(idx)])

        self.last_layer = SoftmaxWithLoss()

    def __init_weight(self, weight_init_std):
        """重みの初期値設定

        Parameters
        ----------
        weight_init_std : 重みの標準偏差を指定(e.g. 0.01)
            'relu'または'he'を指定した場合は「Heの初期値」を設定
            'sigmoid'または'xavier'を指定した場合は「Xavierの初期値」を設定
        """
        all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size]
        for idx in range(1, len(all_size_list)):
            scale = weight_init_std
            if str(weight_init_std).lower() in ('relu', 'he'):
                scale = np.sqrt(2.0 / all_size_list[idx - 1])  # ReLUを使う場合に推奨される初期値
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / all_size_list[idx - 1])  # sigmoidを使う場合に推奨される初期値

            self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx-1], all_size_list[idx])
            self.params['b' + str(idx)] = np.zeros(all_size_list[idx])

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)

        return x

    def loss(self, x, t):
        """損失関数を求める

        Parameters
        ----------
        x : 入力データ
        t : 教師ラベル

        Returns
        -------
        損失関数の値
        """
        y = self.predict(x)

        weight_decay = 0
        for idx in range(1, self.hidden_layer_num + 2):
            W = self.params['W' + str(idx)]
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W ** 2)

        return self.last_layer.forward(y, t) + weight_decay

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1 : t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):
        """勾配を求める(数値微分)

        Parameters
        ----------
        x : 入力データ
        t : 教師ラベル

        Returns
        -------
        各層の勾配を持ったディクショナリ変数
            grads['W1']、grads['W2']、...は各層の重み
            grads['b1']、grads['b2']、...は各層のバイアス
        """
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = numerical_gradient(loss_W, self.params['W' + str(idx)])
            grads['b' + str(idx)] = numerical_gradient(loss_W, self.params['b' + str(idx)])

        return grads

    def gradient(self, x, t):
        """勾配を求める(誤差逆伝搬法)

        Parameters
        ----------
        x : 入力データ
        t : 教師ラベル

        Returns
        -------
        各層の勾配を持ったディクショナリ変数
            grads['W1']、grads['W2']、...は各層の重み
            grads['b1']、grads['b2']、...は各層のバイアス
        """
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.last_layer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # 設定
        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = self.layers['Affine' + str(idx)].dW + self.weight_decay_lambda * self.layers['Affine' + str(idx)].W
            grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db

        return grads

class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout

        return dx


class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1.0 / (1.0 + np.exp(-x))  # sigmoid (computed inline; no sigmoid() helper exists in this file)
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out

        return dx

class Affine:
    def __init__(self, W, b):
        self.W =W
        self.b = b
        
        self.x = None
        self.original_x_shape = None
        # 重み・バイアスパラメータの微分
        self.dW = None
        self.db = None

    def forward(self, x):
        # テンソル対応
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b

        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        
        dx = dx.reshape(*self.original_x_shape)
        return dx
    
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None # softmaxの出力
        self.t = None # 教師データ

    def forward(self, x, t):
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size: # 教師データがone-hot-vectorの場合
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        
        return dx

    def softmax(self,x):
        x = x - np.max(x, axis=-1, keepdims=True)   # オーバーフロー対策
        return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)

        # 教師データがone-hot-vectorの場合、正解ラベルのインデックスに変換
        if t.size == y.size:
            t = t.argmax(axis=1)

        batch_size = y.shape[0]
        return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

class Trainer:

    def __init__(self, network, x_train, t_train, x_test, t_test,
                 epochs=20, mini_batch_size=100,
                 optimizer='SGD', optimizer_param={'lr':0.01}, 
                 evaluate_sample_num_per_epoch=None, verbose=True):
        self.network = network
        self.verbose = verbose
        self.x_train = x_train
        self.t_train = t_train
        self.x_test = x_test
        self.t_test = t_test
        self.epochs = epochs
        self.batch_size = mini_batch_size
        self.evaluate_sample_num_per_epoch = evaluate_sample_num_per_epoch

        # optimizer
        optimizer_class_dict = {'sgd'     :SGD,
                                'momentum':Momentum,
                                'nesterov':Nesterov,
                                'adagrad' :AdaGrad,
                                'rmsprop' :RMSprop,
                                'adam'    :Adam}
        self.optimizer = optimizer_class_dict[optimizer.lower()](**optimizer_param)
        
        self.train_size = x_train.shape[0]
        self.iter_per_epoch = max(self.train_size / mini_batch_size, 1)
        self.max_iter = int(epochs * self.iter_per_epoch)
        self.current_iter = 0
        self.current_epoch = 0
        
        self.train_loss_list = []
        self.train_acc_list = []
        self.test_acc_list = []

    def train_step(self):
        batch_mask = np.random.choice(self.train_size, self.batch_size)
        x_batch = self.x_train[batch_mask]
        t_batch = self.t_train[batch_mask]
        
        grads = self.network.gradient(x_batch, t_batch)
        self.optimizer.update(self.network.params, grads)
        
        loss = self.network.loss(x_batch, t_batch)
        self.train_loss_list.append(loss)
        if self.verbose: print("train loss:" + str(loss))
        
        if self.current_iter % self.iter_per_epoch == 0:
            self.current_epoch += 1
            
            x_train_sample, t_train_sample = self.x_train, self.t_train
            x_test_sample, t_test_sample = self.x_test, self.t_test
            if not self.evaluate_sample_num_per_epoch is None:
                t = self.evaluate_sample_num_per_epoch
                x_train_sample, t_train_sample = self.x_train[:t], self.t_train[:t]
                x_test_sample, t_test_sample   = self.x_test[:t], self.t_test[:t]
                
            train_acc = self.network.accuracy(x_train_sample, t_train_sample)
            test_acc = self.network.accuracy(x_test_sample, t_test_sample)
            self.train_acc_list.append(train_acc)
            self.test_acc_list.append(test_acc)

            if self.verbose: print("epoch:",str(self.current_epoch),
                                   "train acc:",str(train_acc),
                                   "test acc:", str(test_acc) )
        self.current_iter += 1

    def train(self):
        for i in range(self.max_iter):
            self.train_step()

        test_acc = self.network.accuracy(self.x_test, self.t_test)

        if self.verbose:
            print("=============== Final Test Accuracy ===============")
            print("test acc:" + str(test_acc))

# 確率的勾配降下法(Stochastic Gradient Descent)
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr
        
    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key] 

class Momentum:
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None
        
    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():                                
                self.v[key] = np.zeros_like(val)
                
        for key in params.keys():
            self.v[key] = self.momentum*self.v[key] - self.lr*grads[key] 
            params[key] += self.v[key]

# Nesterov's Accelerated Gradient http://arxiv.org/abs/1212.0901
class Nesterov:
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None
        
    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)
            
        for key in params.keys():
            params[key] += self.momentum * self.momentum * self.v[key]
            params[key] -= (1 + self.momentum) * self.lr * grads[key]
            self.v[key] *= self.momentum
            self.v[key] -= self.lr * grads[key]

class AdaGrad:
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None
        
    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)
            
        for key in params.keys():
            self.h[key] += grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

class RMSprop:
    def __init__(self, lr=0.01, decay_rate = 0.99):
        self.lr = lr
        self.decay_rate = decay_rate
        self.h = None
        
    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)
            
        for key in params.keys():
            self.h[key] *= self.decay_rate
            self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)


class Adam: # http://arxiv.org/abs/1412.6980v8
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None
        
    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)
        
        self.iter += 1
        lr_t  = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / \
            (1.0 - self.beta1**self.iter)
        
        for key in params.keys():
            #self.m[key] = self.beta1*self.m[key] + (1-self.beta1)*grads[key]
            #self.v[key] = self.beta2*self.v[key] + (1-self.beta2)*(grads[key]**2)
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key])
            
            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)
            
            #unbias_m += (1 - self.beta1) * (grads[key] - self.m[key]) # correct bias
            #unbisa_b += (1 - self.beta2) * (grads[key]*grads[key] - self.v[key]) # correct bias
            #params[key] += self.lr * unbias_m / (np.sqrt(unbisa_b) + 1e-7)

if __name__ == '__main__':
    main()

Writing the code above produces the output shown below, though my own understanding of it is still shaky.

Mitigating overfitting with Dropout

deep-learning-from-scratch/ch06 at master · oreilly-japan/deep-learning-from-scratch · GitHub

This entry is my transcription (写経) of pp. 193-195 of 「ゼロから作るDeep Learning ①」 (Deep Learning from Scratch: the theory and implementation of deep learning, learned with Python).

Besides the Weight decay (荷重減衰) covered earlier, there is apparently another way to reduce overfitting, called Dropout.

Dropout avoids overfitting by randomly deleting neurons while training, as shown below. (The idea seems similar to ensemble learning and random forests.)
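As a tiny illustration of the masking idea (a toy example of my own, not from the book's code): during training each activation survives with probability 1 - dropout_ratio, while at inference no mask is applied and the output is scaled by the keep-probability instead.

import numpy as np

dropout_ratio = 0.2
x = np.array([[1.0, 2.0, 3.0, 4.0]])

# training: a random mask zeroes roughly 20% of the activations
mask = np.random.rand(*x.shape) > dropout_ratio
print(x * mask)                    # e.g. [[1. 0. 3. 4.]]

# inference: scale by the keep-probability instead of masking
print(x * (1.0 - dropout_ratio))   # [[0.8 1.6 2.4 3.2]]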

# coding: utf-8
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
import urllib.request
import gzip


def main():
    mymnist = MyMnist()
    (x_train, t_train, x_test, t_test) = mymnist.load_mnist()

    # 過学習を再現するために、学習データを削減
    x_train = x_train[:300]
    t_train = t_train[:300]

    network = MultiLayerNetExtend(input_size    =784,
                                  hidden_size_list=[100,100,100,100,100,100],
                                  output_size   =10,
                                  use_dropout   =True,
                                  dropout_ration=0.2 )
    
    trainer = Trainer(network,
                      x_train,
                      t_train,
                      x_test,
                      t_test,
                      epochs=301,
                      mini_batch_size=100,
                      optimizer='sgd',
                      optimizer_param={'lr': 0.01},
                      verbose=True)
    trainer.train()

    train_acc_list, test_acc_list = trainer.train_acc_list, trainer.test_acc_list

    # グラフの描画==========
    markers = {'train': 'o', 'test': 's'}
    x = np.arange(len(train_acc_list))
    plt.plot(x, train_acc_list, marker='o', label='train', markevery=10)
    plt.plot(x, test_acc_list, marker='s', label='test', markevery=10)
    plt.xlabel("epochs")
    plt.ylabel("accuracy")
    plt.ylim(0, 1.0)
    plt.legend(loc='lower right')
    plt.show()

class MyMnist:
    def __init__(self):
        pass

    def load_mnist(self):
        data_files = self.download_mnist()
        # convert numpy
        dataset = {}
        dataset['train_img']   = self.load_img(  data_files['train_img'] )
        dataset['train_label'] = self.load_label(data_files['train_label'])
        dataset['test_img']    = self.load_img(  data_files['test_img']  )
        dataset['test_label']  = self.load_label(data_files['test_label'])

        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0

        for key in ('train_label','test_label'):
            dataset[key]=self.change_one_hot_label( dataset[key] )

        return (dataset['train_img'],
                dataset['train_label'],
                dataset['test_img'],
                dataset['test_label'] )

    def change_one_hot_label(self,X):
        T = np.zeros((X.size, 10))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        return T
    
    def download_mnist(self):
        url_base = 'http://yann.lecun.com/exdb/mnist/'
        key_file = {'train_img'  :'train-images-idx3-ubyte.gz',
                    'train_label':'train-labels-idx1-ubyte.gz',
                    'test_img'   :'t10k-images-idx3-ubyte.gz',
                    'test_label' :'t10k-labels-idx1-ubyte.gz' }
        data_files = {}
        dataset_dir = os.path.dirname(os.path.abspath(__file__))
        
        for data_name, file_name in key_file.items():
            req_url   = url_base+file_name
            file_path = dataset_dir + "/" + file_name

            request  = urllib.request.Request( req_url )
            response = urllib.request.urlopen(request).read()
            with open(file_path, mode='wb') as f:
                f.write(response)
                
            data_files[data_name] = file_path
        return data_files

    def load_img( self,file_path):
        img_size    = 784 # = 28*28
        
        with gzip.open(file_path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        data = data.reshape(-1, img_size)
        return data
    
    def load_label(self,file_path):
        with gzip.open(file_path, 'rb') as f:
            labels = np.frombuffer(f.read(), np.uint8, offset=8)
        return labels

# 拡張版の全結合による多層ニューラルネットワーク
# (Weight Decay、Dropout、Batch Normalizationの機能を持つ)
class MultiLayerNetExtend:

    def __init__(
            self,
            input_size,         # 入力size (MNISTの場合 784)
            hidden_size_list,   # 隠れ層のneuron数list 例[100,100,100]
            output_size,        # 出力size (MNISTの場合 10)
            activation='relu',  # 活性化関数 relu sigmoid
            weight_init_std='relu',# ※
            weight_decay_lambda=0, # Weight Decay(L2ノルム)の強さ
            use_dropout = False,   
            dropout_ration = 0.5,  # Dropoutの割り合い
            use_batchnorm=False ):

        # ※ weight_init_std : 重みの標準偏差を指定(例0.01)
        #        relu or he        →Heの初期値
        #        sigmoid or xavier →Xavierの初期値
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size_list = hidden_size_list
        self.hidden_layer_num = len(hidden_size_list)
        self.use_dropout = use_dropout
        self.weight_decay_lambda = weight_decay_lambda
        self.use_batchnorm = use_batchnorm
        self.params = {}

        # 重みの初期化
        self.__init_weight(weight_init_std)

        # レイヤの生成
        activation_layer = {'sigmoid': Sigmoid, 'relu': Relu}
        self.layers = OrderedDict()
        for idx in range(1, self.hidden_layer_num+1):
            self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
                                                      self.params['b' + str(idx)])
            if self.use_batchnorm:
                self.params['gamma' + str(idx)] = np.ones(hidden_size_list[idx-1])
                self.params['beta' + str(idx)]  = np.zeros(hidden_size_list[idx-1])
                self.layers['BatchNorm' + str(idx)] = \
                    BatchNormalization(self.params['gamma' + str(idx)],
                                       self.params['beta' + str(idx)] )
                
            self.layers['Activation_function' + str(idx)] = \
                activation_layer[activation]()
            
            if self.use_dropout:
                self.layers['Dropout' + str(idx)] = Dropout(dropout_ration)

        idx = self.hidden_layer_num + 1
        self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], self.params['b' + str(idx)])

        self.last_layer = SoftmaxWithLoss()

    def __init_weight(self, weight_init_std):
        """重みの初期値設定

        Parameters
        ----------
        weight_init_std : 重みの標準偏差を指定(e.g. 0.01)
            'relu'または'he'を指定した場合は「Heの初期値」を設定
            'sigmoid'または'xavier'を指定した場合は「Xavierの初期値」を設定
        """
        all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size]
        for idx in range(1, len(all_size_list)):
            scale = weight_init_std
            if str(weight_init_std).lower() in ('relu', 'he'):
                scale = np.sqrt(2.0 / all_size_list[idx - 1])  # ReLUを使う場合に推奨される初期値
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / all_size_list[idx - 1])  # sigmoidを使う場合に推奨される初期値
            self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx-1], all_size_list[idx])
            self.params['b' + str(idx)] = np.zeros(all_size_list[idx])

    def predict(self, x, train_flg=False):
        for key, layer in self.layers.items():
            if "Dropout" in key or "BatchNorm" in key:
                x = layer.forward(x, train_flg)
            else:
                x = layer.forward(x)

        return x

    def loss(self, x, t, train_flg=False):
        """損失関数を求める
        引数のxは入力データ、tは教師ラベル
        """
        y = self.predict(x, train_flg)

        weight_decay = 0
        for idx in range(1, self.hidden_layer_num + 2):
            W = self.params['W' + str(idx)]
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2)

        return self.last_layer.forward(y, t) + weight_decay

    def accuracy(self, x, t):
        y = self.predict(x, train_flg=False)
        y = np.argmax(y, axis=1)
        if t.ndim != 1 : t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):
        """勾配を求める(数値微分)

        Parameters
        ----------
        x : 入力データ
        t : 教師ラベル

        Returns
        -------
        各層の勾配を持ったディクショナリ変数
            grads['W1']、grads['W2']、...は各層の重み
            grads['b1']、grads['b2']、...は各層のバイアス
        """
        loss_W = lambda W: self.loss(x, t, train_flg=True)

        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = numerical_gradient(loss_W, self.params['W' + str(idx)])
            grads['b' + str(idx)] = numerical_gradient(loss_W, self.params['b' + str(idx)])
            
            if self.use_batchnorm and idx != self.hidden_layer_num+1:
                grads['gamma' + str(idx)] = numerical_gradient(loss_W, self.params['gamma' + str(idx)])
                grads['beta' + str(idx)] = numerical_gradient(loss_W, self.params['beta' + str(idx)])

        return grads
        
    def gradient(self, x, t):
        # forward
        self.loss(x, t, train_flg=True)

        # backward
        dout = 1
        dout = self.last_layer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # 設定
        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = self.layers['Affine' + str(idx)].dW + self.weight_decay_lambda * self.params['W' + str(idx)]
            grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db

            if self.use_batchnorm and idx != self.hidden_layer_num+1:
                grads['gamma' + str(idx)] = self.layers['BatchNorm' + str(idx)].dgamma
                grads['beta' + str(idx)] = self.layers['BatchNorm' + str(idx)].dbeta

        return grads
    
class Affine:
    def __init__(self, W, b):
        self.W =W
        self.b = b
        
        self.x = None
        self.original_x_shape = None
        # 重み・バイアスパラメータの微分
        self.dW = None
        self.db = None

    def forward(self, x):
        # テンソル対応
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(*self.original_x_shape)
        return dx

class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx

class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1.0 / (1.0 + np.exp(-x))  # sigmoid (computed inline; no sigmoid() helper exists in this file)
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx


# http://arxiv.org/abs/1502.03167
class BatchNormalization:
    def __init__(self,
                 gamma,
                 beta,
                 momentum=0.9,
                 running_mean=None,
                 running_var=None):
        self.gamma = gamma
        self.beta = beta
        self.momentum = momentum
        self.input_shape = None # Conv層の場合は4次元、全結合層の場合は2次元

        # テスト時に使用する平均と分散
        self.running_mean = running_mean
        self.running_var = running_var  
        
        # backward時に使用する中間データ
        self.batch_size = None
        self.xc = None
        self.std = None
        self.dgamma = None
        self.dbeta = None

    def forward(self, x, train_flg=True):
        self.input_shape = x.shape
        if x.ndim != 2:
            N, C, H, W = x.shape
            x = x.reshape(N, -1)

        out = self.__forward(x, train_flg)
        
        return out.reshape(*self.input_shape)
            
    def __forward(self, x, train_flg):
        if self.running_mean is None:
            N, D = x.shape
            self.running_mean = np.zeros(D)
            self.running_var = np.zeros(D)
                        
        if train_flg:
            mu = x.mean(axis=0)
            xc = x - mu
            var = np.mean(xc**2, axis=0)
            std = np.sqrt(var + 10e-7)
            xn = xc / std
            
            self.batch_size = x.shape[0]
            self.xc = xc
            self.xn = xn
            self.std = std
            self.running_mean = \
                self.momentum * self.running_mean + (1-self.momentum) * mu
            self.running_var = \
                self.momentum * self.running_var + (1-self.momentum) * var
        else:
            xc = x - self.running_mean
            xn = xc / ((np.sqrt(self.running_var + 10e-7)))
            
        out = self.gamma * xn + self.beta 
        return out

    def backward(self, dout):
        if dout.ndim != 2:
            N, C, H, W = dout.shape
            dout = dout.reshape(N, -1)

        dx = self.__backward(dout)

        dx = dx.reshape(*self.input_shape)
        return dx

    def __backward(self, dout):
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(self.xn * dout, axis=0)
        dxn = self.gamma * dout
        dxc = dxn / self.std
        dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0)
        dvar = 0.5 * dstd / self.std
        dxc += (2.0 / self.batch_size) * self.xc * dvar
        dmu = np.sum(dxc, axis=0)
        dx = dxc - dmu / self.batch_size
        
        self.dgamma = dgamma
        self.dbeta = dbeta
        
        return dx

# http://arxiv.org/abs/1207.0580
class Dropout:
    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None

    def forward(self, x, train_flg=True):
        if train_flg:
            self.mask = np.random.rand(*x.shape) > self.dropout_ratio
            return x * self.mask
        else:
            return x * (1.0 - self.dropout_ratio)

    def backward(self, dout):
        return dout * self.mask
    
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None # softmaxの出力
        self.t = None # 教師データ

    def forward(self, x, t):
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size: # 教師データがone-hot-vectorの場合
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        
        return dx
    
    def softmax(self, x):
        x = x - np.max(x, axis=-1, keepdims=True)   # オーバーフロー対策
        return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)

        # 教師データがone-hot-vectorの場合、正解ラベルのインデックスに変換
        if t.size == y.size:
            t = t.argmax(axis=1)

        batch_size = y.shape[0]
        return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

class Trainer:

    def __init__(self, network, x_train, t_train, x_test, t_test,
                 epochs=20, mini_batch_size=100,
                 optimizer='SGD', optimizer_param={'lr':0.01}, 
                 evaluate_sample_num_per_epoch=None, verbose=True):
        self.network = network
        self.verbose = verbose
        self.x_train = x_train
        self.t_train = t_train
        self.x_test = x_test
        self.t_test = t_test
        self.epochs = epochs
        self.batch_size = mini_batch_size
        self.evaluate_sample_num_per_epoch = evaluate_sample_num_per_epoch

        # optimizer
        optimizer_class_dict = {'sgd'     :SGD,
                                'momentum':Momentum,
                                'nesterov':Nesterov,
                                'adagrad' :AdaGrad,
                                'rmsprop' :RMSprop,
                                'adam'    :Adam}
        self.optimizer = optimizer_class_dict[optimizer.lower()](**optimizer_param)
        
        self.train_size = x_train.shape[0]
        self.iter_per_epoch = max(self.train_size / mini_batch_size, 1)
        self.max_iter = int(epochs * self.iter_per_epoch)
        self.current_iter = 0
        self.current_epoch = 0
        
        self.train_loss_list = []
        self.train_acc_list = []
        self.test_acc_list = []

    def train_step(self):
        batch_mask = np.random.choice(self.train_size, self.batch_size)
        x_batch = self.x_train[batch_mask]
        t_batch = self.t_train[batch_mask]
        
        grads = self.network.gradient(x_batch, t_batch)
        self.optimizer.update(self.network.params, grads)
        
        loss = self.network.loss(x_batch, t_batch)
        self.train_loss_list.append(loss)
        if self.verbose: print("train loss:" + str(loss))
        
        if self.current_iter % self.iter_per_epoch == 0:
            self.current_epoch += 1
            
            x_train_sample, t_train_sample = self.x_train, self.t_train
            x_test_sample, t_test_sample = self.x_test, self.t_test
            if not self.evaluate_sample_num_per_epoch is None:
                t = self.evaluate_sample_num_per_epoch
                x_train_sample, t_train_sample = self.x_train[:t], self.t_train[:t]
                x_test_sample, t_test_sample   = self.x_test[:t], self.t_test[:t]
                
            train_acc = self.network.accuracy(x_train_sample, t_train_sample)
            test_acc = self.network.accuracy(x_test_sample, t_test_sample)
            self.train_acc_list.append(train_acc)
            self.test_acc_list.append(test_acc)

            if self.verbose: print("epoch:",str(self.current_epoch),
                                   "train acc:",str(train_acc),
                                   "test acc:", str(test_acc) )
        self.current_iter += 1

    def train(self):
        for i in range(self.max_iter):
            self.train_step()

        test_acc = self.network.accuracy(self.x_test, self.t_test)

        if self.verbose:
            print("=============== Final Test Accuracy ===============")
            print("test acc:" + str(test_acc))

# 確率的勾配降下法(Stochastic Gradient Descent)
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr
        
    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key] 

class Momentum:
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None
        
    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():                                
                self.v[key] = np.zeros_like(val)
                
        for key in params.keys():
            self.v[key] = self.momentum*self.v[key] - self.lr*grads[key] 
            params[key] += self.v[key]

# Nesterov's Accelerated Gradient http://arxiv.org/abs/1212.0901
class Nesterov:
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None
        
    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)
            
        for key in params.keys():
            params[key] += self.momentum * self.momentum * self.v[key]
            params[key] -= (1 + self.momentum) * self.lr * grads[key]
            self.v[key] *= self.momentum
            self.v[key] -= self.lr * grads[key]

class AdaGrad:
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None
        
    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)
            
        for key in params.keys():
            self.h[key] += grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

class RMSprop:
    def __init__(self, lr=0.01, decay_rate = 0.99):
        self.lr = lr
        self.decay_rate = decay_rate
        self.h = None
        
    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)
            
        for key in params.keys():
            self.h[key] *= self.decay_rate
            self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)


class Adam: # http://arxiv.org/abs/1412.6980v8
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None
        
    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)
        
        self.iter += 1
        lr_t  = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / \
            (1.0 - self.beta1**self.iter)
        
        for key in params.keys():
            #self.m[key] = self.beta1*self.m[key] + (1-self.beta1)*grads[key]
            #self.v[key] = self.beta2*self.v[key] + (1-self.beta2)*(grads[key]**2)
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key])
            
            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)
            
            #unbias_m += (1 - self.beta1) * (grads[key] - self.m[key]) # correct bias
            #unbisa_b += (1 - self.beta2) * (grads[key]*grads[key] - self.v[key]) # correct bias
            #params[key] += self.lr * unbias_m / (np.sqrt(unbisa_b) + 1e-7)

            
if __name__ == '__main__':
    main()

Writing the code above produces the output below. As in the earlier entry, the training accuracy does not reach 100%, which indicates that overfitting is not occurring.

Mitigating overfitting (regularization) with Weight decay (荷重減衰)

deep-learning-from-scratch/ch06 at master · oreilly-japan/deep-learning-from-scratch · GitHub

This entry is my transcription (写経) of pp. 193-195 of 「ゼロから作るDeep Learning ①」 (Deep Learning from Scratch: the theory and implementation of deep learning, learned with Python).

Main causes of overfitting

  • a model with a large number of parameters and high expressive power
  • too little training data

Because of these, the weight parameters tend to take on large values, and as a result overfitting apparently often occurs.

What is Weight decay (荷重減衰)?

Adding the following penalty, the squared norm of the weights (the L2 norm), to the loss function discourages the weights from growing large:

 \large{ \frac{1}{2} λ W^2 }

where

 \large{ W = (w_1,w_2,…,w_n) }
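Differentiating this penalty with respect to each weight gives λW, which is why the backpropagation code below adds self.weight_decay_lambda * W to every dW when assembling the gradients:

 \large{ \frac{\partial}{\partial W} \left( \frac{1}{2} λ \sum_i w_i^2 \right) = λ W }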

Python implementation of Weight decay

# coding: utf-8
import os
import sys

sys.path.append(os.pardir)  # 親ディレクトリのファイルをインポートするための設定
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import gzip
from collections import OrderedDict

    
def main():
    max_epochs = 201
    
    mymnist = MyMnist()
    train_test_data = mymnist.load_mnist()

    markers = {'train': 'o', 'test': 's'}
    
    # weight decayは荷重減衰で、0はweight decayを使用しない場合
    for i, weight_decay_lambda in enumerate([0, 0.1]):
        
        (train_acc_list,test_acc_list) = \
            my_train(train_test_data,
                     weight_decay_lambda,
                     max_epochs )

        plt.subplot(1,2,i+1)
        x = np.arange(max_epochs)
        plt.plot(x, train_acc_list,marker='o',label='train',markevery=10)
        plt.plot(x, test_acc_list, marker='s',label='test', markevery=10)
        plt.xlabel("epochs")
        plt.ylabel("accuracy")
        plt.ylim(0, 1.0)
        plt.legend(loc='lower right')
    plt.show()


def my_train(train_test_data,
             weight_decay_lambda, # 荷重減衰の設定
             max_epochs):
    (x_train, t_train, x_test, t_test) = train_test_data
    # 過学習を再現するために、学習データを削減
    x_train = x_train[:300]
    t_train = t_train[:300]

    network = MultiLayerNet(input_size=784,
                            hidden_size_list=[100,100,100,100,100,100],
                            output_size=10,
                            weight_decay_lambda=weight_decay_lambda)
    optimizer = SGD(lr=0.01)

    train_size = x_train.shape[0]
    batch_size = 100

    train_acc_list  = []
    test_acc_list   = []

    iter_per_epoch = max(train_size / batch_size, 1)
    epoch_cnt = 0

    for i in range(1000000000):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        grads = network.gradient(x_batch, t_batch)
        optimizer.update(network.params, grads)

        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            test_acc = network.accuracy(x_test, t_test)
            train_acc_list.append(train_acc)
            test_acc_list.append(test_acc)

            print("epoch:" ,   str(epoch_cnt) ,
                  "train acc:",str(train_acc) ,
                  "test acc:", str(test_acc))

            epoch_cnt += 1
            if epoch_cnt >= max_epochs:
                break

    return (train_acc_list, test_acc_list)
    

class MyMnist:
    def __init__(self):
        pass

    def load_mnist(self):
        data_files = self.download_mnist()
        # convert numpy
        dataset = {}
        dataset['train_img']   = self.load_img(  data_files['train_img'] )
        dataset['train_label'] = self.load_label(data_files['train_label'])
        dataset['test_img']    = self.load_img(  data_files['test_img']  )
        dataset['test_label']  = self.load_label(data_files['test_label'])

        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0

        for key in ('train_label','test_label'):
            dataset[key]=self.change_one_hot_label( dataset[key] )

        return (dataset['train_img'],
                dataset['train_label'],
                dataset['test_img'],
                dataset['test_label'] )

    def change_one_hot_label(self,X):
        T = np.zeros((X.size, 10))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        return T
    
    def download_mnist(self):
        url_base = 'http://yann.lecun.com/exdb/mnist/'
        key_file = {'train_img'  :'train-images-idx3-ubyte.gz',
                    'train_label':'train-labels-idx1-ubyte.gz',
                    'test_img'   :'t10k-images-idx3-ubyte.gz',
                    'test_label' :'t10k-labels-idx1-ubyte.gz' }
        data_files = {}
        dataset_dir = os.path.dirname(os.path.abspath(__file__))
        
        for data_name, file_name in key_file.items():
            req_url   = url_base+file_name
            file_path = dataset_dir + "/" + file_name

            request  = urllib.request.Request( req_url )
            response = urllib.request.urlopen(request).read()
            with open(file_path, mode='wb') as f:
                f.write(response)
                
            data_files[data_name] = file_path
        return data_files

    def load_img( self,file_path):
        img_size    = 784 # = 28*28
        
        with gzip.open(file_path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        data = data.reshape(-1, img_size)
        return data
    
    def load_label(self,file_path):
        with gzip.open(file_path, 'rb') as f:
            labels = np.frombuffer(f.read(), np.uint8, offset=8)
        return labels

# 全結合による多層ニューラルネットワーク
class MultiLayerNet:
    def __init__(self,
                 input_size,            # 入力size (MNISTの場合は784)
                 hidden_size_list,      # 隠れ層のneuron数list 例[100,100,100]
                 output_size,           # 出力size (MNISTの場合は10)
                 activation='relu',     # 活性化関数 relu or sigmoid
                 weight_init_std='relu',# ※
                 weight_decay_lambda=0):# Weight Decay (L2ノルム)の強さ
        # weight_init_std : 重みの標準偏差 ( 例 0.01 )
        #   'relu'または'he'を指定した場合は「Heの初期値」
        #   'sigmoid'または'xavier'を指定した場合は「Xavierの初期値」

        
        self.input_size          = input_size
        self.output_size         = output_size
        self.hidden_size_list    = hidden_size_list
        self.hidden_layer_num    = len(hidden_size_list)
        self.weight_decay_lambda = weight_decay_lambda
        self.params = {}

        self.__init_weight(weight_init_std) # 重み初期化

        # レイヤの生成
        activation_layer = {'sigmoid': Sigmoid, 'relu': Relu}
        self.layers = OrderedDict()
        for idx in range(1, self.hidden_layer_num+1):
            self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
                                                      self.params['b' + str(idx)])
            self.layers['Activation_function' + str(idx)] = \
                activation_layer[activation]()

        idx = self.hidden_layer_num + 1
        self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
            self.params['b' + str(idx)])

        self.last_layer = SoftmaxWithLoss()

    # 重みの初期値設定
    # weight_init_std : 重みの標準偏差を指定(e.g. 0.01)
    #  'relu'または'he'を指定した場合は「Heの初期値」を設定
    #  'sigmoid'または'xavier'を指定した場合は「Xavierの初期値」を設定
    def __init_weight(self, weight_init_std):
        all_size_list = \
            [self.input_size] + self.hidden_size_list + [self.output_size]
        for idx in range(1, len(all_size_list)):
            scale = weight_init_std
            # ReLUを使う場合
            if str(weight_init_std).lower() in ('relu', 'he'):
                scale = np.sqrt(2.0 / all_size_list[idx - 1])
            # sigmoidを使う場合
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / all_size_list[idx - 1])

            self.params['W' + str(idx)] = \
                scale * np.random.randn(all_size_list[idx-1], all_size_list[idx])
            self.params['b' + str(idx)] = np.zeros(all_size_list[idx])

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)

        return x

    # 損失関数。x:入力データ、t:教師ラベル
    def loss(self, x, t):
        y = self.predict(x)

        weight_decay = 0
        for idx in range(1, self.hidden_layer_num + 2):
            W = self.params['W' + str(idx)]
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W ** 2)

        return self.last_layer.forward(y, t) + weight_decay

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1 : t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # 勾配(数値微分)。x:入力データ、t:教師ラベル
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = \
                numerical_gradient(loss_W, self.params['W' + str(idx)])
            grads['b' + str(idx)] = \
                numerical_gradient(loss_W, self.params['b' + str(idx)])
        return grads

    # 勾配(誤差逆伝搬法)。x:入力データ、t:教師ラベル
    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.last_layer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # 設定
        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = \
                self.layers['Affine' + str(idx)].dW + \
                self.weight_decay_lambda * self.layers['Affine' + str(idx)].W
            grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db

        return grads
    
class Affine:
    def __init__(self, W, b):
        self.W =W
        self.b = b
        
        self.x = None
        self.original_x_shape = None
        # 重み・バイアスパラメータの微分
        self.dW = None
        self.db = None

    def forward(self, x):
        # テンソル対応
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        
        dx = dx.reshape(*self.original_x_shape)
        return dx

class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx

class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1.0 / (1.0 + np.exp(-x))  # sigmoid (computed inline; no sigmoid() helper exists in this file)
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx
    
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None # softmaxの出力
        self.t = None # 教師データ

    def forward(self, x, t):
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size: # 教師データがone-hot-vectorの場合
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx
    
    def softmax(self,x):
        x = x - np.max(x, axis=-1, keepdims=True)   # オーバーフロー対策
        return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)

        # 教師データがone-hot-vectorの場合、正解ラベルのインデックスに変換
        if t.size == y.size:
            t = t.argmax(axis=1)

        batch_size = y.shape[0]
        return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

# 確率的勾配降下法(Stochastic Gradient Descent)
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr
        
    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key] 

    
if __name__ == '__main__':
    main()

Running the above produces the plots below.

The right-hand plot is the one with Weight decay; the gap in accuracy between the training data and the test data is smaller there. Also, since the training accuracy does not reach 100%, overfitting seems to be prevented.

Adjusting the spread of the activation distribution with Batch Normalization

The previous entry used the Xavier and He initializations so that the initial weights have a suitable spread, but adding a technique called Batch Normalization can apparently also adjust the spread of the activation distribution.

Note that "activation" here refers to the values produced by the Affine layers.

[Figure: layer arrangement with Batch Normalization inserted — input → Affine → BatchNormalization → ReLU → … → Affine → Softmax → CrossEntropyError → L]

Advantages of Batch Normalization

These are as follows:

  • learning progresses faster (a larger learning rate can be used)
  • less dependence on the initial weight values
  • overfitting is suppressed (reducing the need for Dropout and the like)

Normalization formula used by Batch Normalization

 \Large{ x_i ← \frac{x_i - μ_B}{ \sqrt{ σ_B^2 + ε } } }

where μ_B, σ_B^2 and ε are:

 \large{ mean: μ_B = \frac {1}{m} \sum ^m_{i=1} x_i }
 \large{ variance: σ_B^2 = \frac {1}{m} \sum ^m_{i=1} (x_i - μ_B)^2 }

ε: a small value added to avoid division by zero, e.g. 10e-7.

The normalized  \Large{ x_i } above is then scaled and shifted as follows:

 \large{ y_i ← γ x_i + β }
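As a minimal numeric sketch of these formulas (a toy mini-batch of my own, not from the book), each feature is normalized over the batch and then scaled by γ and shifted by β:

import numpy as np

x = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])           # mini-batch: 3 samples, 2 features
gamma, beta, eps = 1.0, 0.0, 10e-7

mu  = x.mean(axis=0)                  # per-feature mean μ_B
var = x.var(axis=0)                   # per-feature variance σ_B^2
xn  = (x - mu) / np.sqrt(var + eps)   # normalized x_i
out = gamma * xn + beta               # y_i = γ x_i + β
print(out)                            # each column now has mean 0, variance 1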

Computational graph of Batch Normalization

The formulas above can be represented by the computational graph below.

[Figure: Batch Normalization computational graph — x (N,D), γ (D,) and β (D,) flow forward to out; dout, dx, dγ and dβ flow along the backward pass]

Python implementation of Batch Normalization

# coding: utf-8
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import gzip
from collections import OrderedDict

max_epochs    = 20
batch_size    = 100
learning_rate = 0.01
x_train = []
t_train = []
train_size    = 0
    
def main():
    
    # MNISTデータのdownload
    global x_train
    global t_train
    mymnist = MyMnist()
    (x_train, t_train, x_test, t_test) = mymnist.load_mnist()

    # 学習data削減
    x_train = x_train[:1000]
    t_train = t_train[:1000]
    global train_size
    train_size    = x_train.shape[0]

    # 3.グラフの描画==========
    
    # 対数スケールの配列生成
    weight_scale_list = np.logspace(0, -4, num=16)
    # 等差数列の生成
    x = np.arange(max_epochs)

    for i, w in enumerate(weight_scale_list):
        print(str(i+1), "/ 16" )
        train_acc_list, bn_train_acc_list = __train(w)

        plt.subplot(4,4,i+1)
        plt.title("W:" + str(w))
        if i == 15:
            plt.plot(x,
                     bn_train_acc_list,
                     label='Batch Normalization',
                     markevery=2)
            plt.plot(x,
                     train_acc_list,
                     linestyle = "--",
                     label='without BatchNorm',
                     markevery=2)
        else:
            plt.plot(x,
                     bn_train_acc_list,
                     markevery=2)
            plt.plot(x,
                     train_acc_list,
                     linestyle="--",
                     markevery=2)

        plt.ylim(0, 1.0)
        if i % 4:
            plt.yticks([])
        else:
            plt.ylabel("accuracy")
        if i < 12:
            plt.xticks([])
        else:
            plt.xlabel("epochs")
        plt.legend(loc='lower right')

    plt.show()

def __train(weight_init_std):
    bn_network = MultiLayerNetExtend(
        input_size=784,
        hidden_size_list=[100, 100, 100, 100, 100],
        output_size=10, 
        weight_init_std=weight_init_std,
        use_batchnorm=True)
    
    network = MultiLayerNetExtend(
        input_size=784,
        hidden_size_list=[100, 100, 100, 100, 100],
        output_size=10,
        weight_init_std=weight_init_std )
    
    optimizer = SGD(lr=learning_rate)
    
    train_acc_list    = []
    bn_train_acc_list = []
    
    iter_per_epoch = max(train_size / batch_size, 1)
    epoch_cnt = 0
    
    for i in range(1000000000):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]
    
        for _network in (bn_network, network):
            grads = _network.gradient(x_batch, t_batch)
            optimizer.update(_network.params, grads)
    
        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            bn_train_acc = bn_network.accuracy(x_train, t_train)
            train_acc_list.append(train_acc)
            bn_train_acc_list.append(bn_train_acc)
    
            print("epoch:" + str(epoch_cnt) + " | " + str(train_acc) + " - " + str(bn_train_acc))
    
            epoch_cnt += 1
            if epoch_cnt >= max_epochs:
                break
                
    return train_acc_list, bn_train_acc_list

# 拡張版の全結合による多層ニューラルネットワーク
# ( Weight Decay、Dropout、Batch Normalizationの機能を持つ )
class MultiLayerNetExtend:
    
    def __init__(
            self,
            input_size,         # 入力size(MNISTの場合784)
            hidden_size_list,   # 隠れ層のneuron数list(例[100, 100, 100])
            output_size,        # 出力size (MNISTの場合は10)
            activation='relu',  # 活性化関数 relu sigmoid
            weight_init_std='relu', # ※
            weight_decay_lambda=0,  # Weight Decay(L2ノルム)の強さ
            use_dropout = False,    
            dropout_ration = 0.5,   # Dropoutの割り合い
            use_batchnorm=False):
        # ※ weight_init_std : 重みの標準偏差を指定(e.g. 0.01)
        #       relu or he の「Heの初期値」、
        #       sigmoid or xavierの場合「Xavierの初期値」

        
        self.input_size         = input_size
        self.output_size        = output_size
        self.hidden_size_list   = hidden_size_list
        self.hidden_layer_num   = len(hidden_size_list)
        self.use_dropout        = use_dropout
        self.weight_decay_lambda= weight_decay_lambda
        self.use_batchnorm      = use_batchnorm
        self.params = {}

        # 重みの初期化
        self.__init_weight(weight_init_std)

        # レイヤの生成
        activation_layer = {'sigmoid': Sigmoid, 'relu': Relu}
        self.layers = OrderedDict()
        for idx in range(1, self.hidden_layer_num+1):
            self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
                                                      self.params['b' + str(idx)])
            if self.use_batchnorm:
                self.params['gamma' + str(idx)] = np.ones(hidden_size_list[idx-1])
                self.params['beta' + str(idx)]  = np.zeros(hidden_size_list[idx-1])
                self.layers['BatchNorm' + str(idx)] = \
                    BatchNormalization( self.params['gamma'+ str(idx)],
                                        self.params['beta' + str(idx)] )
                
            self.layers['Activation_function' + str(idx)] =\
                activation_layer[activation]()
            
            if self.use_dropout:
                self.layers['Dropout' + str(idx)] = Dropout(dropout_ration)

        idx = self.hidden_layer_num + 1
        self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)],
                                                  self.params['b' + str(idx)])
        self.last_layer = SoftmaxWithLoss()
        
    # 重みの初期値設定
    def __init_weight(self, weight_init_std):
        # weight_init_std : 重みの標準偏差を指定(e.g. 0.01)
        #    'relu'または'he'を指定した場合は「Heの初期値」を設定
        #    'sigmoid'または'xavier'を指定した場合は「Xavierの初期値」を設定

        all_size_list = \
            [self.input_size] + self.hidden_size_list + [self.output_size]
        for idx in range(1, len(all_size_list)):
            scale = weight_init_std
            
            # ReLUを使う場合に推奨される初期値
            if str(weight_init_std).lower() in ('relu', 'he'):
                scale = np.sqrt(2.0 / all_size_list[idx - 1])
            # sigmoidを使う場合に推奨される初期値
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / all_size_list[idx - 1])
                
            self.params['W' + str(idx)] = \
                scale * np.random.randn(all_size_list[idx-1], all_size_list[idx])
            self.params['b' + str(idx)] = np.zeros(all_size_list[idx])

    def predict(self, x, train_flg=False):
        for key, layer in self.layers.items():
            if "Dropout" in key or "BatchNorm" in key:
                x = layer.forward(x, train_flg)
            else:
                x = layer.forward(x)

        return x

    # x: 入力data、t:教師ラベル
    def loss(self, x, t, train_flg=False):
        y = self.predict(x, train_flg)

        weight_decay = 0
        for idx in range(1, self.hidden_layer_num + 2):
            W = self.params['W' + str(idx)]
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2)

        return self.last_layer.forward(y, t) + weight_decay

    def accuracy(self, x, t):
        y = self.predict(x, train_flg=False)
        y = np.argmax(y, axis=1)
        if t.ndim != 1 : t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # 勾配を求める (数値微分)。x: 入力data、t:教師ラベル
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t, train_flg=True)

        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = \
                self._numerical_gradient(loss_W, self.params['W' + str(idx)])
            grads['b' + str(idx)] = \
                self._numerical_gradient(loss_W, self.params['b' + str(idx)])
            
            if self.use_batchnorm and idx != self.hidden_layer_num+1:
                grads['gamma' + str(idx)] = \
                    self._numerical_gradient(loss_W,
                                             self.params['gamma' + str(idx)])
                grads['beta' + str(idx)] = \
                    self._numerical_gradient(loss_W,
                                             self.params['beta' + str(idx)])

        return grads

    def _numerical_gradient(self, f, x):
        h = 1e-4 # 0.0001
        grad = np.zeros_like(x)

        it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            idx = it.multi_index
            tmp_val = x[idx]
            x[idx] = tmp_val + h
            fxh1 = f(x) # f(x+h)

            x[idx] = tmp_val - h 
            fxh2 = f(x) # f(x-h)
            grad[idx] = (fxh1 - fxh2) / (2*h)

            x[idx] = tmp_val # 値を元に戻す
            it.iternext()
        return grad
    
        
    def gradient(self, x, t):
        # forward
        self.loss(x, t, train_flg=True)

        # backward
        dout = 1
        dout = self.last_layer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # 設定
        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = \
                self.layers['Affine' + str(idx)].dW + \
                self.weight_decay_lambda * self.params['W' + str(idx)]
            
            grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db

            if self.use_batchnorm and idx != self.hidden_layer_num+1:
                grads['gamma' + str(idx)]= \
                    self.layers['BatchNorm' + str(idx)].dgamma
                grads['beta' + str(idx)] = \
                    self.layers['BatchNorm' + str(idx)].dbeta

        return grads
    
class Affine:
    def __init__(self, W, b):
        self.W =W
        self.b = b
        
        self.x = None
        self.original_x_shape = None
        # 重み・バイアスパラメータの微分
        self.dW = None
        self.db = None

    def forward(self, x):
        # テンソル対応
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(*self.original_x_shape)
        return dx
    
class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1 / (1 + np.exp(-x))  # sigmoid (the repo code calls an undefined sigmoid(x))
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx
    
class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx

# http://arxiv.org/abs/1502.03167
class BatchNormalization:
    
    def __init__(self,
                 gamma,
                 beta,
                 momentum=0.9,
                 running_mean=None,
                 running_var=None):
        
        self.gamma    = gamma
        self.beta     = beta
        self.momentum = momentum
        self.input_shape = None # Conv層の場合は4次元、全結合層の場合は2次元

        # テスト時に使用する平均と分散
        self.running_mean = running_mean
        self.running_var  = running_var
        
        # backward時に使用する中間データ
        self.batch_size = None
        self.xc         = None
        self.std        = None
        self.dgamma     = None
        self.dbeta      = None

    def forward(self, x, train_flg=True):
        self.input_shape = x.shape
        if x.ndim != 2:
            N, C, H, W = x.shape
            x = x.reshape(N, -1)

        out = self.__forward(x, train_flg)
        
        return out.reshape(*self.input_shape)
            
    def __forward(self, x, train_flg):
        if self.running_mean is None:
            N, D = x.shape
            self.running_mean = np.zeros(D)
            self.running_var  = np.zeros(D)
                        
        if train_flg:
            mu = x.mean(axis=0)
            xc = x - mu
            var = np.mean(xc**2, axis=0)
            std = np.sqrt(var + 10e-7)
            xn = xc / std
            
            self.batch_size = x.shape[0]
            self.xc = xc
            self.xn = xn
            self.std = std
            self.running_mean = \
                self.momentum * self.running_mean + (1-self.momentum) * mu
            self.running_var = \
                self.momentum * self.running_var + (1-self.momentum) * var
        else:
            xc = x - self.running_mean
            xn = xc / ((np.sqrt(self.running_var + 10e-7)))
            
        out = self.gamma * xn + self.beta 
        return out

    def backward(self, dout):
        if dout.ndim != 2:
            N, C, H, W = dout.shape
            dout = dout.reshape(N, -1)

        dx = self.__backward(dout)

        dx = dx.reshape(*self.input_shape)
        return dx

    def __backward(self, dout):
        dbeta  = dout.sum(axis=0)
        dgamma = np.sum(self.xn * dout, axis=0)
        dxn  = self.gamma * dout
        dxc  = dxn / self.std
        dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0)
        dvar = 0.5 * dstd / self.std
        dxc  += (2.0 / self.batch_size) * self.xc * dvar
        dmu  = np.sum(dxc, axis=0)
        dx   = dxc - dmu / self.batch_size
        
        self.dgamma = dgamma
        self.dbeta = dbeta
        
        return dx

# http://arxiv.org/abs/1207.0580
class Dropout:
    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None

    def forward(self, x, train_flg=True):
        if train_flg:
            self.mask = np.random.rand(*x.shape) > self.dropout_ratio
            return x * self.mask
        else:
            return x * (1.0 - self.dropout_ratio)

    def backward(self, dout):
        return dout * self.mask

class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None # softmaxの出力
        self.t = None # 教師データ

    def forward(self, x, t):
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        
        return self.loss

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)

        # 教師データがone-hot-vectorの場合、正解ラベルのインデックスに変換
        if t.size == y.size:
            t = t.argmax(axis=1)

        batch_size = y.shape[0]
        return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

    def softmax(self,x):
        x = x - np.max(x, axis=-1, keepdims=True)   # オーバーフロー対策
        return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size: # 教師データがone-hot-vectorの場合
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        
        return dx
    

class MyMnist:
    def __init__(self):
        pass

    def load_mnist(self):
        data_files = self.download_mnist()
        # convert numpy
        dataset = {}
        dataset['train_img']   = self.load_img(  data_files['train_img'] )
        dataset['train_label'] = self.load_label(data_files['train_label'])
        dataset['test_img']    = self.load_img(  data_files['test_img']  )
        dataset['test_label']  = self.load_label(data_files['test_label'])

        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0

        for key in ('train_label','test_label'):
            dataset[key]=self.change_one_hot_label( dataset[key] )

        return (dataset['train_img'],
                dataset['train_label'],
                dataset['test_img'],
                dataset['test_label'] )

    def change_one_hot_label(self,X):
        T = np.zeros((X.size, 10))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        return T
    
    def download_mnist(self):
        url_base = 'http://yann.lecun.com/exdb/mnist/'
        key_file = {'train_img'  :'train-images-idx3-ubyte.gz',
                    'train_label':'train-labels-idx1-ubyte.gz',
                    'test_img'   :'t10k-images-idx3-ubyte.gz',
                    'test_label' :'t10k-labels-idx1-ubyte.gz' }
        data_files = {}
        dataset_dir = os.path.dirname(os.path.abspath(__file__))
        
        for data_name, file_name in key_file.items():
            req_url   = url_base+file_name
            file_path = dataset_dir + "/" + file_name

            request  = urllib.request.Request( req_url )
            response = urllib.request.urlopen(request).read()
            with open(file_path, mode='wb') as f:
                f.write(response)
                
            data_files[data_name] = file_path
        return data_files

    def load_img( self,file_path):
        img_size    = 784 # = 28*28
        
        with gzip.open(file_path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        data = data.reshape(-1, img_size)
        return data
    
    def load_label(self,file_path):
        with gzip.open(file_path, 'rb') as f:
            labels = np.frombuffer(f.read(), np.uint8, offset=8)
        return labels


# 確率的勾配降下法(Stochastic Gradient Descent)
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr
        
    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key] 


if __name__ == '__main__':
    main()

Running the above displays the following.

The blue lines in the graphs correspond to Batch Normalization, and you can see that learning progresses faster with it.

(dl_scratch) C:\Users\end0t\tmp\deep-learning-from-scratch\ch06>python foo4.py
1 / 16
epoch:0 | 0.097 - 0.101
epoch:1 | 0.116 - 0.116
epoch:2 | 0.116 - 0.128
epoch:3 | 0.116 - 0.144
epoch:4 | 0.116 - 0.146
epoch:5 | 0.116 - 0.156
epoch:6 | 0.116 - 0.189
epoch:7 | 0.116 - 0.194
epoch:8 | 0.116 - 0.219
epoch:9 | 0.116 - 0.239
epoch:10 | 0.116 - 0.259
epoch:11 | 0.116 - 0.272
epoch:12 | 0.116 - 0.288
epoch:13 | 0.116 - 0.302
epoch:14 | 0.116 - 0.318
epoch:15 | 0.116 - 0.331
epoch:16 | 0.116 - 0.336
epoch:17 | 0.116 - 0.352
epoch:18 | 0.116 - 0.372
epoch:19 | 0.116 - 0.38
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
2 / 16
epoch:0 | 0.094 - 0.081
foo4.py:220: RuntimeWarning: overflow encountered in square
  weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2)
foo4.py:220: RuntimeWarning: invalid value encountered in double_scalars
  weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2)
C:\Users\end0t\Anaconda2\envs\dl_scratch\lib\site-packages\numpy\core\fromnumeric.py:86: RuntimeWarning: overflow encountered in reduce
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

<略>

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
16 / 16
epoch:0 | 0.1 - 0.121
epoch:1 | 0.117 - 0.341
epoch:2 | 0.117 - 0.43
epoch:3 | 0.117 - 0.416
epoch:4 | 0.117 - 0.4
epoch:5 | 0.117 - 0.462
epoch:6 | 0.117 - 0.446
epoch:7 | 0.117 - 0.454
epoch:8 | 0.117 - 0.52
epoch:9 | 0.117 - 0.533
epoch:10 | 0.117 - 0.52
epoch:11 | 0.117 - 0.527
epoch:12 | 0.117 - 0.482
epoch:13 | 0.117 - 0.523
epoch:14 | 0.117 - 0.52
epoch:15 | 0.117 - 0.522
epoch:16 | 0.117 - 0.516
epoch:17 | 0.117 - 0.523
epoch:18 | 0.117 - 0.544
epoch:19 | 0.117 - 0.572

Proper weight initialization with Xavier and He in deep learning

deep-learning-from-scratch/ch06 at master · oreilly-japan/deep-learning-from-scratch · GitHub

A transcription of pp. 177–186 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」.

According to p. 178, to suppress overfitting, the initial weight values should apparently be

  • as small as possible
  • non-zero
  • random

Furthermore, the Xavier initial value is said to suit the sigmoid function and the He initial value the ReLU function, and these apparently give the weights (and hence the activations) a proper spread. (See the book for details.)

[Figures: per-layer activation distributions for weight initializations drawn from a Gaussian with std 1, a Gaussian with std 0.01, the Xavier initial value, and the He initial value with ReLU]
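
As a rough sketch of the experiment behind those figures (this is my own minimal illustration, not the book's script; the layer width, depth and sample count are assumptions), pushing random data through a few fully connected layers and comparing the spread of the activations per initialization:

import numpy as np

def layer_activation_stds(scale_fn, act, n=100, layers=5, samples=1000):
    # Push random input through `layers` fully-connected layers and
    # record the standard deviation of the activations at each layer.
    x = np.random.randn(samples, n)
    stds = []
    for _ in range(layers):
        w = np.random.randn(n, n) * scale_fn(n)
        x = act(np.dot(x, w))
        stds.append(round(float(x.std()), 3))
    return stds

sigmoid = lambda a: 1.0 / (1.0 + np.exp(-a))
relu    = lambda a: np.maximum(0, a)

print(layer_activation_stds(lambda n: 1.0,              sigmoid))  # Gaussian, std=1
print(layer_activation_stds(lambda n: 0.01,             sigmoid))  # Gaussian, std=0.01
print(layer_activation_stds(lambda n: np.sqrt(1.0 / n), sigmoid))  # Xavier: sqrt(1/n)
print(layer_activation_stds(lambda n: np.sqrt(2.0 / n), relu))     # He: sqrt(2/n), with ReLU

As I understand the book's result, the first two settings lose the spread across layers (the sigmoid outputs saturate near 0/1 or collapse around 0.5), while the Xavier and He settings keep a comparable spread in every layer.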

The following Python script compares how the loss changes (i.e. how efficiently training progresses) depending on the initial weight values.

# coding: utf-8
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import gzip
from collections import OrderedDict

def main():
    # MNISTデータのdownload
    mymnist = MyMnist()
    (x_train, t_train, x_test, t_test) = mymnist.load_mnist()

    train_size     = x_train.shape[0]
    batch_size     = 128
    max_iterations = 2000

    weight_init_types = {
        'std=0.01': 0.01,       # 通常?は、標準偏差初期値
        'Xavier': 'sigmoid',    # sigmoidには、Xavier初期値
        'He': 'relu'            # reluには、He初期値
    }
    optimizer = SGD(lr=0.01)    # 確率的勾配降下法

    networks   = {}
    train_loss = {}

    for key, weight_type in weight_init_types.items():
        networks[key] = \
            MultiLayerNet(input_size=784,
                          hidden_size_list=[100,100,100,100],
                          output_size=10,
                          weight_init_std=weight_type)
        train_loss[key] = []


    # 訓練
    print( "i       std=0.01                Xavier                  He" )
    
    for i in range(max_iterations):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]
    
        for key in weight_init_types.keys():
            grads = networks[key].gradient(x_batch, t_batch)
            optimizer.update(networks[key].params, grads)
    
            loss = networks[key].loss(x_batch, t_batch)
            train_loss[key].append(loss)

        
        if i % 100 == 0:
            disp_cols = [str(i)]
            for key in weight_init_types.keys():
                loss = networks[key].loss(x_batch, t_batch)
                disp_cols.append( str(loss) )
            print( "\t".join( disp_cols ) )

    my_plot = MyPlot()
    my_plot.disp_graph(max_iterations,
                       weight_init_types,
                       train_loss )
    
    
class MyPlot:
    def __init__(self):
        pass

    def disp_graph(self,
                   max_iterations,
                   weight_init_types,
                   train_loss ):
        markers = {'std=0.01': 'o', 'Xavier': 's', 'He': 'D'}
        x = np.arange(max_iterations)
        for key in weight_init_types.keys():
            plt.plot(x,
                     self.smooth_curve(train_loss[key]),
                     marker=markers[key],
                     markevery=100,
                     label=key )
        plt.xlabel("iterations")
        plt.ylabel("loss")
        plt.ylim(0, 2.5)
        plt.legend()
        plt.show()

    # 損失関数のグラフを滑らかにする
    # http://glowingpython.blogspot.jp/2012/02/convolution-with-numpy.html
    def smooth_curve(self, x):
        window_len = 11
        s = np.r_[x[window_len-1:0:-1], x, x[-1:-window_len:-1]]
        w = np.kaiser(window_len, 2)
        y = np.convolve(w/w.sum(), s, mode='valid')
        return y[5:len(y)-5]


# 確率的勾配降下法 Stochastic Gradient Descent
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr    # lrは学習率
        
    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key] 

class MyMnist:
    def __init__(self):
        pass

    def load_mnist(self):
        data_files = self.download_mnist()
        # convert numpy
        dataset = {}
        dataset['train_img']   = self.load_img(  data_files['train_img'] )
        dataset['train_label'] = self.load_label(data_files['train_label'])
        dataset['test_img']    = self.load_img(  data_files['test_img']  )
        dataset['test_label']  = self.load_label(data_files['test_label'])

        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0

        for key in ('train_label','test_label'):
            dataset[key]=self.change_one_hot_label( dataset[key] )

        return (dataset['train_img'],
                dataset['train_label'],
                dataset['test_img'],
                dataset['test_label'] )

    def change_one_hot_label(self,X):
        T = np.zeros((X.size, 10))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        return T
    
    def download_mnist(self):
        url_base = 'http://yann.lecun.com/exdb/mnist/'
        key_file = {'train_img'  :'train-images-idx3-ubyte.gz',
                    'train_label':'train-labels-idx1-ubyte.gz',
                    'test_img'   :'t10k-images-idx3-ubyte.gz',
                    'test_label' :'t10k-labels-idx1-ubyte.gz' }
        data_files = {}
        dataset_dir = os.path.dirname(os.path.abspath(__file__))
        
        for data_name, file_name in key_file.items():
            req_url   = url_base+file_name
            file_path = dataset_dir + "/" + file_name

            request  = urllib.request.Request( req_url )
            response = urllib.request.urlopen(request).read()
            with open(file_path, mode='wb') as f:
                f.write(response)
                
            data_files[data_name] = file_path
        return data_files

    def load_img( self,file_path):
        img_size    = 784 # = 28*28
        
        with gzip.open(file_path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        data = data.reshape(-1, img_size)
        return data
    
    def load_label(self,file_path):
        with gzip.open(file_path, 'rb') as f:
            labels = np.frombuffer(f.read(), np.uint8, offset=8)
        return labels

# 全結合による多層ニューラルネットワーク
class MultiLayerNet:
    
    def __init__(
            self,
            input_size,            # 入力size MNISTの場合 784
            hidden_size_list,      # 隠れ層のneuron数 例[100,100,100]
            output_size,           # 出力size MNISTの場合 10
            activation='relu',     # 活性化関数 'relu' or 'sigmoid'
            weight_init_std='relu',# ※
            weight_decay_lambda=0):# Weight Decay (L2ノルム)の強さ
    
        # ※ weight_init_std :
        #    重みの標準偏差を指定(e.g. 0.01)
        #    'relu'または'he'を指定した場合は「Heの初期値」
        #    'sigmoid'または'xavier'を指定した場合は「Xavierの初期値」
        
        self.input_size          = input_size
        self.output_size         = output_size
        self.hidden_size_list    = hidden_size_list
        self.hidden_layer_num    = len(hidden_size_list)
        self.weight_decay_lambda = weight_decay_lambda
        self.params = {}

        # 重みの初期化
        self.__init_weight(weight_init_std)

        # レイヤの生成
        activation_layer = {'sigmoid': Sigmoid, 'relu': Relu}
        self.layers = OrderedDict()
        
        for idx in range(1, self.hidden_layer_num+1):
            self.layers['Affine' + str(idx)] = \
                Affine(self.params['W' + str(idx)],
                       self.params['b' + str(idx)])
            
            self.layers['Activation_function' + str(idx)] = \
                activation_layer[activation]()

        idx = self.hidden_layer_num + 1
        self.layers['Affine' + str(idx)] = \
            Affine(self.params['W' + str(idx)],
                   self.params['b' + str(idx)])

        self.last_layer = SoftmaxWithLoss()
        
    # 重みの初期値設定
    def __init_weight(self, weight_init_std):

        all_size_list = \
            [self.input_size] + self.hidden_size_list + [self.output_size]
        
        for idx in range(1, len(all_size_list)):
            scale = weight_init_std
            
            if str(weight_init_std).lower() in ('relu', 'he'):
                # ReLUを使う場合の初期値
                scale = np.sqrt(2.0 / all_size_list[idx - 1])
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                # sigmoidを使う場合の初期値
                scale = np.sqrt(1.0 / all_size_list[idx - 1])

            self.params['W' + str(idx)] = \
                scale * np.random.randn(all_size_list[idx-1], all_size_list[idx])
            self.params['b' + str(idx)] = np.zeros(all_size_list[idx])

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    # 損失関数を求める
    def loss(self,
             x, # 入力データ
             t):# 教師ラベル
        
        y = self.predict(x)

        weight_decay = 0
        for idx in range(1, self.hidden_layer_num + 2):
            W = self.params['W' + str(idx)]
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W ** 2)

        return self.last_layer.forward(y, t) + weight_decay

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1 : t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # 勾配を求める (数値微分)
    def numerical_gradient(self, x, t):
        
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = \
                self._numerical_gradient(loss_W, self.params['W' + str(idx)])
            grads['b' + str(idx)] = \
                self._numerical_gradient(loss_W, self.params['b' + str(idx)])
        return grads

    def _numerical_gradient(self, f, x):
        h = 1e-4 # 0.0001
        grad = np.zeros_like(x)

        it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            idx = it.multi_index
            tmp_val = x[idx]
            x[idx] = tmp_val + h
            fxh1 = f(x) # f(x+h)

            x[idx] = tmp_val - h 
            fxh2 = f(x) # f(x-h)
            grad[idx] = (fxh1 - fxh2) / (2*h)

            x[idx] = tmp_val # 値を元に戻す
            it.iternext()
        return grad

    # 勾配を求める (誤差逆伝搬法)
    def gradient(self, x, t):
        # forward
        self.loss(x, t)
        # backward
        dout = 1
        dout = self.last_layer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # 設定
        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = \
                self.layers['Affine' + str(idx)].dW + \
                self.weight_decay_lambda * self.layers['Affine' + str(idx)].W
            
            grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db

        return grads

class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx

class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1 / (1 + np.exp(-x))  # sigmoid (the repo code calls an undefined sigmoid(x))
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx
    
class Affine:
    def __init__(self, W, b):
        self.W =W
        self.b = b
        
        self.x = None
        self.original_x_shape = None
        # 重み・バイアスパラメータの微分
        self.dW = None
        self.db = None

    def forward(self, x):
        # テンソル対応
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(*self.original_x_shape)
        return dx
    
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None # softmaxの出力
        self.t = None # 教師データ

    def forward(self, x, t):
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        
        return self.loss

    def softmax(self,x):
        x = x - np.max(x, axis=-1, keepdims=True)   # オーバーフロー対策
        return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)

        # 教師データがone-hot-vectorの場合、正解ラベルのインデックスに変換
        if t.size == y.size:
            t = t.argmax(axis=1)

        batch_size = y.shape[0]
        return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
    

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        
        # 教師データがone-hot-vectorの場合
        if self.t.size == self.y.size: 
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx
    
if __name__ == '__main__':
    main()

Writing it as above produces the output below.

(dl_scratch) C:\Users\end0t\tmp\deep-learning-from-scratch\ch06>python foo.py
i       std=0.01                Xavier                  He
0       2.3025004704494005      2.2944839686050837      2.3653180242752265
100     2.302454057219614       2.262232616944392       1.423335924549919
200     2.3036850103449593      2.1885701331268885      0.8922211412264922
300     2.3016946285904103      1.9951984874523245      0.5921122127049883
400     2.3023076862028184      1.5446343436237382      0.40275908822140105
500     2.3030326901257485      1.1345704960672047      0.43646367986152734
600     2.301907296081413       0.7156987132105668      0.3053186464209969
700     2.3033136374489893      0.48499635572109423     0.17792225418744217
800     2.3005787741577275      0.5644043961354724      0.34814906298688486
900     2.303748175006521       0.6012669563754791      0.3043807980209645
1000    2.302643527159904       0.48081435514697207     0.27306946946391025
1100    2.297993101564411       0.38269275507390704     0.3207061269442881
1200    2.3055223181126547      0.38107335902028217     0.22529239435114604
1300    2.304165004571269       0.3188476087654375      0.18484296010456175
1400    2.2957005728712074      0.2831411390906005      0.20438082403853383
1500    2.3043031319472287      0.30482817306281096     0.20284927100771363
1600    2.3077019241695296      0.29383020286123773     0.17759237769208688
1700    2.2960884184185426      0.41951974165910566     0.26643741166574597
1800    2.299928249921105       0.2764734085240914      0.2112177030146325
1900    2.2992910824123762      0.46042525877196677     0.32704587115374617

Neural network training with numpy for Python (error backpropagation version)

GitHub - oreilly-japan/deep-learning-from-scratch: 『ゼロから作る Deep Learning』(O'Reilly Japan, 2016)

A transcription of pp. 156–163 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」.

In the earlier entry below, the gradients were computed by numerical differentiation; this time, error backpropagation is used instead.

numpy for python によるニューラルネットワーク学習 - end0tknr's kipple - web写経開発

# coding: utf-8

import gzip
import matplotlib.pyplot as plt
import numpy             as np
import os
import sys
import urllib.request
from collections import OrderedDict


def main():
    # MNISTデータのdownload
    mymnist = MyMnist()
    (x_train, t_train, x_test, t_test) = mymnist.load_mnist()

    network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

    iters_num     = 10000 # 繰り返し回数
    train_size    = x_train.shape[0]
    batch_size    = 100
    learning_rate = 0.1

    train_loss_list = []
    train_acc_list  = []
    test_acc_list   = []
    
    iter_per_epoch = max(train_size / batch_size, 1)

    for i in range(iters_num):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        # 勾配
        #grad = network.numerical_gradient(x_batch, t_batch)
        grad = network.gradient(x_batch, t_batch)

        # 更新
        for key in ('W1', 'b1', 'W2', 'b2'):
            network.params[key] -= learning_rate * grad[key]

        loss = network.loss(x_batch, t_batch)
        train_loss_list.append(loss)

        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            test_acc = network.accuracy(x_test, t_test)
            train_acc_list.append(train_acc)
            test_acc_list.append(test_acc)
            print(train_acc, test_acc)

    my_plot = MyPlot()
    my_plot.disp_graph(train_acc_list,test_acc_list)
    
class TwoLayerNet:

    def __init__(self,
                 input_size,    # 入力層のneuron数
                 hidden_size,   # 隠れ層の〃
                 output_size,   # 出力層の〃
                 weight_init_std = 0.01):
        # 重みの初期化
        self.params = {}
        self.params['W1'] = \
            weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = \
            weight_init_std * np.random.randn(hidden_size, output_size) 
        self.params['b2'] = np.zeros(output_size)

        # Affine層と ReLU層の作成
        self.layers = OrderedDict()     # 順序付き辞書
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])

        # 出力層としての Softmax-with-Loss 層の作成
        self.lastLayer = SoftmaxWithLoss()
        
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        
        return x
        
    # x:入力データ, t:教師データ
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1 : t = np.argmax(t, axis=1)
        
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
        
    # x:入力データ, t:教師データ
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = self._numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = self._numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = self._numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = self._numerical_gradient(loss_W, self.params['b2'])
        
        return grads

    def _numerical_gradient(self, f, x):
        h = 1e-4 # 0.0001
        grad = np.zeros_like(x)

        it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            idx = it.multi_index
            tmp_val = x[idx]
            x[idx] = tmp_val + h
            fxh1 = f(x) # f(x+h)

            x[idx] = tmp_val - h 
            fxh2 = f(x) # f(x-h)
            grad[idx] = (fxh1 - fxh2) / (2*h)

            x[idx] = tmp_val # 値を元に戻す
            it.iternext()
        return grad
    
        
    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # 設定
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        
        return grads


class Affine:
    def __init__(self, W, b):
        self.W =W
        self.b = b
        self.x = None
        self.original_x_shape = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T) # W.Tはnumpyによる転置行列
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        # 変数前のアスタリスクは、入力値の分割
        dx = dx.reshape(*self.original_x_shape)
        return dx


class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None # 損失
        self.y    = None # softmax の出力
        self.t    = None # 教師データ(one-hot vector)
    def forward(self, x, t):
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        return self.loss
    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx

    def softmax(self,x):
        x = x - np.max(x, axis=-1, keepdims=True)   # オーバーフロー対策
        return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)
            
        # 教師データがone-hot-vectorの場合、正解ラベルのindexへ
        if t.size == y.size:
            t = t.argmax(axis=1)
             
        batch_size = y.shape[0]
        return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

class Relu:
    def __init__(self):
        self.mask = None
        
    def forward(self, x):
        # 0以下の要素をTrue、それ以外をFalse化
        self.mask = (x <= 0)
        # print( self.mask )

        # Trueの要素位置にあるものを0化、それ以外はそのまま
        out = x.copy()
        out[self.mask] = 0
        # print( out )
        
        return out
    
    def backward(self, dout):
        # print( self.mask )
        
        dout[self.mask] = 0
        # print( dout )
        dx = dout
        return dx

class MyMnist:
    def __init__(self):
        pass

    def load_mnist(self):
        data_files = self.download_mnist()
        # convert numpy
        dataset = {}
        dataset['train_img']   = self.load_img(  data_files['train_img'] )
        dataset['train_label'] = self.load_label(data_files['train_label'])
        dataset['test_img']    = self.load_img(  data_files['test_img']  )
        dataset['test_label']  = self.load_label(data_files['test_label'])

        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0

        for key in ('train_label','test_label'):
            dataset[key]=self.change_one_hot_label( dataset[key] )

        return (dataset['train_img'],
                dataset['train_label'],
                dataset['test_img'],
                dataset['test_label'] )

    def change_one_hot_label(self,X):
        T = np.zeros((X.size, 10))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        return T
    
    def download_mnist(self):
        url_base = 'http://yann.lecun.com/exdb/mnist/'
        key_file = {'train_img'  :'train-images-idx3-ubyte.gz',
                    'train_label':'train-labels-idx1-ubyte.gz',
                    'test_img'   :'t10k-images-idx3-ubyte.gz',
                    'test_label' :'t10k-labels-idx1-ubyte.gz' }
        data_files = {}
        dataset_dir = os.path.dirname(os.path.abspath(__file__))
        
        for data_name, file_name in key_file.items():
            req_url   = url_base+file_name
            file_path = dataset_dir + "/" + file_name

            request  = urllib.request.Request( req_url )
            response = urllib.request.urlopen(request).read()
            with open(file_path, mode='wb') as f:
                f.write(response)
                
            data_files[data_name] = file_path
        return data_files

    def load_img( self,file_path):
        img_size    = 784 # = 28*28
        
        with gzip.open(file_path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        data = data.reshape(-1, img_size)
        return data
    
    def load_label(self,file_path):
        with gzip.open(file_path, 'rb') as f:
            labels = np.frombuffer(f.read(), np.uint8, offset=8)
        return labels

class MyPlot:
    def __init__(self):
        pass
    def disp_graph(self,train_acc_list,test_acc_list):
        markers = {'train': 'o', 'test': 's'}
        x = np.arange(len(train_acc_list))
        plt.plot(x, train_acc_list, label='train acc')
        plt.plot(x, test_acc_list, label='test acc', linestyle='--')
        plt.xlabel("epochs")
        plt.ylabel("accuracy")
        plt.ylim(0, 1.0)
        plt.legend(loc='lower right')
        plt.show()
        
    
if __name__ == '__main__':
    main()
    

Running the above displays the following.

(dl_scratch) C:\Users\end0t\tmp\deep-learning-from-scratch\ch05>python foo5.py
0.14275 0.1407
0.9061833333333333 0.9092
0.9242833333333333 0.9257
0.9384166666666667 0.9379
0.9472333333333334 0.9462
0.9537 0.9511
0.9568166666666666 0.9557
0.9632333333333334 0.9602
0.96755 0.9632
0.9692 0.965
0.97145 0.9673
0.9744 0.9684
0.9755333333333334 0.9698
0.9776 0.9712
0.9781 0.9707
0.9797666666666667 0.9707
0.9784 0.9721

Computational graphs and numpy-for-Python implementations of the Affine and Softmax-with-Loss layers


As a transcription of pp. 147–154 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」, this entry turns the parts marked in red below into computational graphs.


[Figure: layer layout — input → Affine → ReLU → Affine → Softmax → CrossEntropyError → L; the parts in red are the layers covered in this entry]

The Affine layer

The Affine layer apparently computes X·W + B = O, and its computational graph is as follows.

The parts in red are the error backpropagation obtained by differentiation.

[Figure: computational graph of the Affine layer — X (2,), W (2,3), B (3,), Y = X·W + B (3,); in red, the backward values ∂L/∂X = (∂L/∂Y)·Wᵀ and ∂L/∂W = Xᵀ·(∂L/∂Y), with shapes chosen so that corresponding element counts match]

Extending this to mini-batches gives the following.

[Figure: the mini-batch version — X (N,2), W (2,3), B (3,), Y (N,3); ∂L/∂X = (∂L/∂Y)·Wᵀ, ∂L/∂W = Xᵀ·(∂L/∂Y), and ∂L/∂B is the sum of ∂L/∂Y over the batch]

Below is the Python implementation of the Affine layer.

class Affine:
    def __init__(self, W, b):
        self.W =W
        self.b = b
        self.x = None
        self.original_x_shape = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x

        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T) # W.Tはnumpyによる転置行列
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        # 変数前のアスタリスクは、入力値の分割
        dx = dx.reshape(*self.original_x_shape)
        return dx
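
A quick shape check of the class above (my own illustration; it assumes the Affine class just shown is already defined, and the shapes follow the mini-batch figure):

import numpy as np

N, D_in, D_out = 4, 2, 3                       # batch size N, input 2, output 3
affine = Affine(np.random.randn(D_in, D_out),  # W: (2,3)
                np.zeros(D_out))               # B: (3,)

x   = np.random.randn(N, D_in)                 # X: (N,2)
out = affine.forward(x)                        # Y = X.W + B: (N,3)
dx  = affine.backward(np.ones_like(out))       # dL/dX: (N,2)

print(out.shape, dx.shape, affine.dW.shape, affine.db.shape)
# -> (4, 3) (4, 2) (2, 3) (3,)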

The Softmax-with-Loss layer

For reasons of screen width, the Softmax part and the Cross Entropy Error part are drawn separately.

[Figure: computational graph of the Softmax part — inputs a_1, a_2, a_3 pass through exp nodes, their sum S and a ÷ node give y_1, y_2, y_3; in red, −t_i/y_i flows in from the Cross Entropy Error side and y_i − t_i flows back toward the inputs]

[Figure: computational graph of the Cross Entropy Error part — ln y_i multiplied by t_i, summed, then multiplied by −1 to give L; the backward values toward y_i are −t_i/y_i]

Below is the Python implementation.

import numpy as np

class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None # 損失
        self.y    = None # softmax の出力
        self.t    = None # 教師データ(one-hot vector)
    def forward(self, x, t):
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        return self.loss
    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx

    def softmax(self,x):
        x = x - np.max(x, axis=-1, keepdims=True)   # オーバーフロー対策
        return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)
            
        # 教師データがone-hot-vectorの場合、正解ラベルのindexへ
        if t.size == y.size:
            t = t.argmax(axis=1)
             
        batch_size = y.shape[0]
        return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
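
A small check of the layer above (my own illustration with made-up scores and one-hot labels), confirming that backward() returns (y − t) / batch_size:

import numpy as np

layer = SoftmaxWithLoss()
x = np.array([[0.3, 2.9, 4.0],
              [0.1, 0.2, 0.7]])   # raw scores for two samples
t = np.array([[0.0, 0.0, 1.0],
              [0.0, 1.0, 0.0]])   # one-hot labels

loss = layer.forward(x, t)
dx   = layer.backward()
print(loss)                                  # cross-entropy loss
print(np.allclose(dx, (layer.y - t) / 2.0))  # True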

Computational graph and derivative of the Sigmoid activation function

A transcription of pp. 143–146 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」.


Python implementation of the Sigmoid layer

# coding: utf-8
import numpy as np

def main():
    sigmoid = Sigmoid()
    x = np.array( [[1.0, -0.5],
                   [-2.0, 3.0]] )
    # 順伝播
    fwd = sigmoid.forward(x)
    print(fwd)
    
    # 逆伝播
    back = sigmoid.backward(x)
    print(back)
    
class Sigmoid:
    def __init__(self):
        self.out = None

    # https://github.com/oreilly-japan/deep-learning-from-scratch/blob/master/common/layers.py
    # にある Sigmoid は、未定義のsigmoid(x)を使用しており
    # 誤っている気がします。
    # def forward(self, x):
    #     out = sigmoid(x)
    #     self.out = out
    #     return out
    
    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out

    # forward()で算出したoutを保持し、再利用しています
    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx
    
if __name__ == '__main__':
    main()

Computational graph

[Figure: computational graph of the sigmoid layer — x passes through ×(−1), exp, +1 and ÷ nodes to give y = 1/(1 + e^{−x}); the backward signal works out to (∂L/∂y)·y(1 − y)]
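
Written out, the derivation that the figure encodes is the following (standard calculus, in the same [tex:] notation used elsewhere in this blog):

[tex: \large{ y = \frac{1}{1 + e^{-x}} }]

[tex: \large{ \frac{\partial y}{\partial x} = \frac{e^{-x}}{(1 + e^{-x})^2} = \frac{1}{1 + e^{-x}} \cdot \frac{e^{-x}}{1 + e^{-x}} = y(1 - y) }]

[tex: \large{ \frac{\partial L}{\partial x} = \frac{\partial L}{\partial y} \, y(1 - y) }]

which is exactly what backward() computes as dout * (1.0 - self.out) * self.out.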

Computational graph and derivative of the ReLU activation function

A transcription of pp. 141–142 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」.


Python implementation of the ReLU layer

# coding: utf-8
import numpy as np

def main():
    relu = Relu()
    x = np.array( [[1.0, -0.5],
                   [-2.0, 3.0]] )
    # 順伝播
    fwd = relu.forward(x)
    print(fwd)
    
    # 逆伝播
    back = relu.backward(x)
    print(back)
    

class Relu:
    def __init__(self):
        self.mask = None
        
    def forward(self, x):
        # 0以下の要素をTrue、それ以外をFalse化
        self.mask = (x <= 0)
        # print( self.mask )

        # Trueの要素位置にあるものを0化、それ以外はそのまま
        out = x.copy()
        out[self.mask] = 0
        # print( out )
        
        return out
    
    def backward(self, dout):
        # print( self.mask )
        
        dout[self.mask] = 0
        # print( dout )
        dx = dout
        return dx
    
if __name__ == '__main__':
    main()

The O'Reilly support site linked above publishes the sample code shown above, but for backward() in particular I feel the following is more correct.

# coding: utf-8

import numpy as np

def main():
    relu = Relu()
    x = np.array( [[1.0, -0.5],
                   [-2.0, 3.0]] )
    # 順伝播
    fwd = relu.forward(x)
    print(fwd)
    
    # 逆伝播
    back = relu.backward(x)
    print(back)
    

class Relu:
    def __init__(self):
        self.mask = None
        
    def forward(self, x):
        out = np.maximum(0, x)
        return out
    
    def backward(self, dout):
        dx = np.where(dout>0, 1, 0)
        return dx
    
if __name__ == '__main__':
    main()

Computational graph

[Figure: computational graph of the ReLU layer — forward: y = x for x > 0 and y = 0 for x ≤ 0; backward: ∂L/∂x = ∂L/∂y for x > 0 and 0 for x ≤ 0]

Understanding error backpropagation in neural networks via computational graphs and the derivative of composite functions

A transcription of pp. 129–132 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」.

The basic rule: multiply the value from upstream by the partial derivative

First, representing [tex: \large{ y = f(x) }] as a computational graph gives the figure below.

Next, the backward signal from the output value E is obtained by multiplying it by the partial derivative, so it looks like the parts in red.

[Figure: a node f with input x and output y = f(x); in red, E flows in from the output side and E·∂y/∂x flows back toward x]

An application: backpropagation through a composite function

Next, consider the case [tex: \large{ z = (x + y)^2 }]. This expression can be rewritten as a composite function, as in steps ① → ② in the figure below, so it can be represented by the computational graph shown there.

The backward values are obtained by multiplying the value from upstream by the partial derivative, so they appear as the parts in red in the figure below.
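
Since the figure itself is not reproduced here, the same chain rule written out explicitly (taking the ① → ② substitution to be t = x + y, which is my reading of the book's figure):

[tex: \large{ t = x + y, \quad z = t^2 }]

[tex: \large{ \frac{\partial z}{\partial t} = 2t, \quad \frac{\partial t}{\partial x} = 1, \quad \frac{\partial t}{\partial y} = 1 }]

[tex: \large{ \frac{\partial z}{\partial x} = \frac{\partial z}{\partial t} \frac{\partial t}{\partial x} = 2(x + y), \qquad \frac{\partial z}{\partial y} = 2(x + y) }]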

Neural network training with numpy for Python

GitHub - oreilly-japan/deep-learning-from-scratch: 『ゼロから作る Deep Learning』(O'Reilly Japan, 2016)

A transcription of pp. 112–121 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」.

The following Kikagaku page is also easy to follow, I think.

https://free.kikagaku.ai/tutorial/basic_of_deep_learning/learn/neural_network_basic_backward


python

# coding: utf-8

import gzip
import matplotlib.pyplot as plt
import numpy             as np
import os
import sys
import urllib.request

def main():
    # MNISTデータのdownload
    mymnist = MyMnist()
    (x_train, t_train, x_test, t_test) = mymnist.load_mnist()

    iters_num     = 10000 # 繰り返し回数
    batch_size    = 100
    learning_rate = 0.1   # 学習率 η
    
    train_size  = x_train.shape[0]
    iter_per_epoch = max(train_size / batch_size, 1)
    
    train_loss_list = []
    train_acc_list  = []
    test_acc_list   = []

    # 2層ニューラルネットワークでの訓練
    network = TwoLayerNet(28*28, 100, 10)
    # for param_type in ['W1','b1','W2','b2',]:
    #     print( param_type, network.params[param_type].shape )

    for i in range(iters_num):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        # 勾配を計算し、パラメータを更新
        #grad = network.numerical_gradient(x_batch, t_batch)
        grad = network.gradient(x_batch, t_batch)

        for key in ('W1', 'b1', 'W2', 'b2'):
            network.params[key] -= learning_rate * grad[key]
        
        loss = network.loss(x_batch, t_batch)
        train_loss_list.append(loss)

        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            test_acc = network.accuracy(x_test, t_test)
            train_acc_list.append(train_acc)
            test_acc_list.append(test_acc)
            print("train acc, test acc | " + str(train_acc) + \
                  ", " + str(test_acc))

    my_plot = MyPlot()
    my_plot.disp_graph(train_acc_list,test_acc_list)
    

   
class MyMnist:
    def __init__(self):
        pass

    def load_mnist(self):
        data_files = self.download_mnist()
        # convert numpy
        dataset = {}
        dataset['train_img']   = self.load_img(  data_files['train_img'] )
        dataset['train_label'] = self.load_label(data_files['train_label'])
        dataset['test_img']    = self.load_img(  data_files['test_img']  )
        dataset['test_label']  = self.load_label(data_files['test_label'])

        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0

        for key in ('train_label','test_label'):
            dataset[key]=self.change_one_hot_label( dataset[key] )

        return (dataset['train_img'],
                dataset['train_label'],
                dataset['test_img'],
                dataset['test_label'] )

    def change_one_hot_label(self,X):
        T = np.zeros((X.size, 10))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        return T
    
    def download_mnist(self):
        url_base = 'http://yann.lecun.com/exdb/mnist/'
        key_file = {'train_img'  :'train-images-idx3-ubyte.gz',
                    'train_label':'train-labels-idx1-ubyte.gz',
                    'test_img'   :'t10k-images-idx3-ubyte.gz',
                    'test_label' :'t10k-labels-idx1-ubyte.gz' }
        data_files = {}
        dataset_dir = os.path.dirname(os.path.abspath(__file__))
        
        for data_name, file_name in key_file.items():
            req_url   = url_base+file_name
            file_path = dataset_dir + "/" + file_name

            request  = urllib.request.Request( req_url )
            response = urllib.request.urlopen(request).read()
            with open(file_path, mode='wb') as f:
                f.write(response)
                
            data_files[data_name] = file_path
        return data_files

    def load_img( self,file_path):
        img_size    = 784 # = 28*28
        
        with gzip.open(file_path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        data = data.reshape(-1, img_size)
        return data
    
    def load_label(self,file_path):
        with gzip.open(file_path, 'rb') as f:
            labels = np.frombuffer(f.read(), np.uint8, offset=8)
        return labels


class MyPlot:
    def __init__(self):
        pass
    def disp_graph(self,train_acc_list,test_acc_list):
        markers = {'train': 'o', 'test': 's'}
        x = np.arange(len(train_acc_list))
        plt.plot(x, train_acc_list, label='train acc')
        plt.plot(x, test_acc_list, label='test acc', linestyle='--')
        plt.xlabel("epochs")
        plt.ylabel("accuracy")
        plt.ylim(0, 1.0)
        plt.legend(loc='lower right')
        plt.show()

class TwoLayerNet:

    def __init__(self,
                 input_size,    # number of neurons in the input layer
                 hidden_size,   # number of neurons in the hidden layer
                 output_size,   # number of neurons in the output layer
                 weight_init_std=0.01):
        # initialize the weights
        self.params = {}
        self.params['W1'] = \
            weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = \
            weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
    
        a1 = np.dot(x, W1) + b1
        z1 = self.sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = self.softmax(a2)
        
        return y
        
    # x: input data, t: teacher (target) data
    def loss(self, x, t):
        y = self.predict(x)
        return self.cross_entropy_error(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
        
    # x: input data, t: teacher (target) data
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = self._numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = self._numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = self._numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = self._numerical_gradient(loss_W, self.params['b2'])
        
        return grads
        
    def _numerical_gradient(self, f, x):
        h = 1e-4 # 0.0001
        grad = np.zeros_like(x)

        it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            idx = it.multi_index
            tmp_val = x[idx]
            x[idx] = tmp_val + h
            fxh1 = f(x) # f(x+h)

            x[idx] = tmp_val - h 
            fxh2 = f(x) # f(x-h)
            grad[idx] = (fxh1 - fxh2) / (2*h)

            x[idx] = tmp_val # restore the original value
            it.iternext()
        return grad

    def gradient(self, x, t):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        
        batch_num = x.shape[0]
        
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = self.sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = self.softmax(a2)
        
        # backward
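        # for softmax + cross-entropy error, the gradient w.r.t. a2 simplifies to (y - t);
        # dividing by batch_num averages the gradient over the mini-batch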
        dy = (y - t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)
        
        dz1 = np.dot(dy, W2.T)
        da1 = self.sigmoid_grad(a1) * dz1
        grads['W1'] = np.dot(x.T, da1)
        grads['b1'] = np.sum(da1, axis=0)

        return grads

    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))    

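    # derivative of the sigmoid: sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))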
    def sigmoid_grad(self,x):
        return (1.0 - self.sigmoid(x)) * self.sigmoid(x)

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)

        # if the teacher data is a one-hot vector, convert it to class-index labels
        if t.size == y.size:
            t = t.argmax(axis=1)

        batch_size = y.shape[0]
        return -np.sum(np.log(y[np.arange(batch_size),t]+ 1e-7))/batch_size

    def softmax(self,x):
        x = x - np.max(x, axis=-1, keepdims=True)   # subtract the max to avoid overflow
        return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)
    
if __name__ == '__main__':
    main()

Execution results

(dl_scratch) C:\Users\end0t\tmp\deep-learning-from-scratch\ch04>python foo.py
train acc, test acc | 0.0993, 0.1032
train acc, test acc | 0.8140666666666667, 0.8168
train acc, test acc | 0.8824, 0.8873
train acc, test acc | 0.8999166666666667, 0.9047
train acc, test acc | 0.9076333333333333, 0.9121
train acc, test acc | 0.9124, 0.9162
train acc, test acc | 0.9184166666666667, 0.9219
train acc, test acc | 0.9225333333333333, 0.9231
train acc, test acc | 0.92515, 0.9279
train acc, test acc | 0.9284166666666667, 0.9299
train acc, test acc | 0.9318833333333333, 0.9321
train acc, test acc | 0.9342333333333334, 0.9346
train acc, test acc | 0.9367, 0.9374
train acc, test acc | 0.93965, 0.9399
train acc, test acc | 0.9422, 0.9412
train acc, test acc | 0.9439166666666666, 0.9423
train acc, test acc | 0.9461333333333334, 0.9441

Loss functions and gradients (partial derivatives) in neural networks

GitHub - oreilly-japan/deep-learning-from-scratch: 『ゼロから作る Deep Learning』(O'Reilly Japan, 2016)

This is a transcription of pp. 89~108 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」.


Cross-entropy error as a loss function

 \large{ E = - \sum_k t_k \log y_k }
where  \large{ t_k } is the target (correct) data and  \large{ y_k } is the actual output data.

The cross-entropy error is defined as above; when doing "mini-batch learning" over several samples at once, it becomes the following.

"Mini-batch learning" means that instead of updating the parameters one sample at a time, several samples are fed in together, the gradient is computed for each, and the average of those gradients is used to update the parameters.

 \large{ E = - \frac{1}{N} \sum_n \sum_k t_{nk} \log y_{nk} }
where  \large{ t_{nk} } is the target for the k-th element of the n-th sample and  \large{ y_{nk} } is the actual output for the k-th element of the n-th sample.

Implementing the above with numpy for Python gives the following code, as listed on p. 94.

def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    # if the target data is a one-hot vector, convert it to class-index labels
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
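
As a quick check (my own example, not from the book), for a mini-batch of two samples with one-hot targets the function returns the average of the per-sample cross-entropy errors:

import numpy as np

# assumes the cross_entropy_error() defined above
t = np.array([[0, 0, 1], [0, 1, 0]])              # one-hot targets
y = np.array([[0.1, 0.1, 0.8], [0.3, 0.6, 0.1]])  # softmax outputs
print(cross_entropy_error(y, t))                  # -(log 0.8 + log 0.6) / 2 ≈ 0.367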

Computing the gradient by numerically differentiating the loss function

Numerical differentiation of an ordinary scalar function, and its Python implementation, are shown below. Note that while the definition uses the forward difference, the implementation uses the central difference (f(x+h) - f(x-h)) / 2h, which gives a smaller numerical error.

 \large{
  \frac{ d f(x) }{ dx } =  \lim_{h \to 0} \frac{ f(x+h)-f(x)}{h}
}

def numerical_diff(f, x):
    h = 1e-4 # 0.0001
    return (f(x+h) - f(x-h)) / (2*h)

※ f is the loss function and x is the input value
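
A quick sanity check (my own example): differentiating f(x) = x² at x = 2 should give approximately 4.

# assumes the numerical_diff() defined above
def f(x):
    return x ** 2

print(numerical_diff(f, 2.0))   # ≈ 4.0 (analytically df/dx = 2x)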

Furthermore, when the input is a vector with several components such as  \large{x_0, x_1}, and we take partial derivatives  \large{ \frac{\partial f}{\partial x_0}, \frac{\partial f}{\partial x_1} }, the implementation looks like the one on p. 104.

def numerical_gradient(f, x):
    h = 1e-4 # 0.0001
    grad = np.zeros_like(x)
    
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = tmp_val + h
        fxh1 = f(x) # f(x+h)
        
        x[idx] = tmp_val - h 
        fxh2 = f(x) # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2*h)
        
        x[idx] = tmp_val # restore the original value
        it.iternext()   
        
    return grad
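
For example (my own check, in the spirit of the book's example), the gradient of f(x₀, x₁) = x₀² + x₁² at (3, 4) comes out as roughly (6, 8):

import numpy as np

# assumes the numerical_gradient() defined above
def f(x):
    return x[0] ** 2 + x[1] ** 2

print(numerical_gradient(f, np.array([3.0, 4.0])))   # ≈ [6. 8.]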

Finding a minimum with gradient descent

Gradient descent computes the next values of  \large{x_0, x_1} with the following update rule.

 \large{
  x_0 \leftarrow x_0 - \eta \frac{\partial f}{\partial x_0} \\
  x_1 \leftarrow x_1 - \eta \frac{\partial f}{\partial x_1}
}

※ η is the learning rate, fixed in advance, e.g. 0.01 or 0.001

Implementing this in Python gives the following code from p. 107.

def gradient_descent(f, init_x, lr=0.01, step_num=100):
    x = init_x
    x_history = []

    for i in range(step_num):
        x_history.append( x.copy() )

        grad = numerical_gradient(f, x)
        x -= lr * grad

    return x, np.array(x_history)
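
Used on the same f(x₀, x₁) = x₀² + x₁², with an initial value of (-3.0, 4.0) as in the book's example, the iterates converge to (almost) the minimum at the origin:

import numpy as np

# assumes the numerical_gradient() and gradient_descent() defined above
def f(x):
    return x[0] ** 2 + x[1] ** 2

x, x_history = gradient_descent(f, init_x=np.array([-3.0, 4.0]), lr=0.1, step_num=100)
print(x)   # ≈ [-6.1e-10  8.1e-10], i.e. essentially (0, 0)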

Neural network implementation and prediction (forward pass) with numpy for Python

GitHub - oreilly-japan/deep-learning-from-scratch: 『ゼロから作る Deep Learning』(O'Reilly Japan, 2016)

This is a transcription of pp. 64~65 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」.

Assuming the network is already trained, i.e. the weights and biases are fixed, we run a prediction for the input  \large{ x=(1.0, 0.5)}.

[Figure: a 3-layer neural network with input x = (1.0, 0.5), weight matrices W1, W2, W3 and biases b1, b2, b3; the two hidden layers use the sigmoid activation and the output layer uses the identity function]

This neural network can be implemented as follows; running it prints [0.31682708 0.69627909].

# coding: utf-8
import numpy as np

def main():
    network = init_network()
    x = np.array([1.0, 0.5])
    y = forward(network, x)
    print(y)
    
def init_network():
    network = {}
    network['W1'] = np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]])
    network['b1'] = np.array([0.1, 0.2, 0.3])
    network['W2'] = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]])
    network['b2'] = np.array([0.1, 0.2])
    network['W3'] = np.array([[0.1, 0.3], [0.2, 0.4]])
    network['b3'] = np.array([0.1, 0.2])
    return network

def forward(network, x):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1,W2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2,W3) + b3
    
    y = identity_function(a3)
    return y

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# identity function
def identity_function(x):
    return x

if __name__ == '__main__':
    main()
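
For reference (my own annotation, not in the book), the shapes propagate as x:(2,) → W1:(2, 3) → z1:(3,) → W2:(3, 2) → z2:(2,) → W3:(2, 2) → y:(2,), which can be confirmed like this:

# assumes the init_network() defined above
network = init_network()
for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
    print(key, network[key].shape)
# W1 (2, 3) / b1 (3,) / W2 (3, 2) / b2 (2,) / W3 (2, 2) / b3 (2,)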

Activation functions with numpy for Python (step, sigmoid, ReLU, tanh)

GitHub - oreilly-japan/deep-learning-from-scratch: 『ゼロから作る Deep Learning』(O'Reilly Japan, 2016)

This is a transcription of pp. 44~52 of 「ゼロから作るDeep Learning ① (Pythonで学ぶディープラーニングの理論と実装)」.

numpy is used so that the functions can take arrays (matrices) as arguments.

# coding: utf-8
import numpy as np
import matplotlib.pylab as plt

def main():
    x = np.arange(-5.0, 5.0, 0.1)
    y1 = sigmoid(x)             # sigmoid
    y2 = step_function(x)       # step function
    y3 = relu(x)                # ReLU
    y4 = tanh(x)                # tanh
    
    plt.plot(x, y1)
    plt.plot(x, y2, '--')
    plt.plot(x, y3, ':')
    plt.plot(x, y4, '-.')
    plt.ylim(-1.1, 1.1)
    plt.show()
    
def step_function(x):
    return np.array(x > 0, dtype=int)  # np.int was removed from recent numpy; use the builtin int

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def relu(x):
    return np.maximum(0, x)

def tanh(x):
    return np.tanh(x)

if __name__ == '__main__':
    main()

Running the code above produces the plot shown below.

※ Requires Anaconda.