Continuing from the previous entry, this is the code transcribed from the reference URL below.

Reference URL
```python
import copy

import numpy as np
import matplotlib.pyplot as plt

EPISODE = 100  # number of training episodes


def main():
    grid_env = GridWorld()
    agent = QAgent(alpha=0.1,              # learning rate
                   gamma=0.9,              # discount factor
                   epsilon=0.1,            # for ε-greedy action selection
                   actions=np.arange(4),   # the set of actions
                   observation=grid_env.start_pos)
    rewards = []            # per-episode rewards, kept for evaluation
    is_end_episode = False  # has the episode ended (Goal or Trap)?

    for episode in range(EPISODE):
        episode_reward = []  # rewards accumulated in one episode
        while not is_end_episode:
            # ε-greedy action selection
            action = agent.act()
            # take the action and receive the next state, the immediate
            # reward, and whether the episode has ended
            state, reward, is_end_episode = grid_env.step(action)
            # update the Q value from the next state and immediate reward
            agent.observe(state, reward)
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))
        state = grid_env.reset()  # reset the environment
        agent.observe(state)      # put the agent back at the start position
        is_end_episode = False

    # plot the results
    plt.plot(np.arange(EPISODE), rewards)
    plt.xlabel("episode")
    plt.ylabel("reward")
    plt.savefig("result.jpg")
    plt.show()


class QAgent:
    def __init__(self, alpha=.2, epsilon=.1, gamma=.99, actions=None,
                 observation=None):
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.reward_history = []
        self.actions = actions
        self.state = str(observation)
        self.ini_state = str(observation)
        self.previous_state = None
        self.previous_action = None
        self.q_values = self._init_q_values()

    def _init_q_values(self):
        q_values = {}
        q_values[self.state] = np.repeat(0.0, len(self.actions))
        return q_values

    def init_state(self):
        self.previous_state = copy.deepcopy(self.ini_state)
        self.state = copy.deepcopy(self.ini_state)
        return self.state

    # ε-greedy action selection
    def act(self):
        if np.random.uniform() < self.epsilon:
            # random action with probability ε
            action = np.random.randint(0, len(self.q_values[self.state]))
        else:
            # greedy (best known) action with probability 1 - ε
            action = np.argmax(self.q_values[self.state])
        self.previous_action = action
        return action

    # observe the next state and the reward
    def observe(self, next_state, reward=None):
        next_state = str(next_state)
        # if the next state has not been seen before, initialize its Q values
        if next_state not in self.q_values:
            self.q_values[next_state] = np.repeat(0.0, len(self.actions))
        self.previous_state = copy.deepcopy(self.state)
        self.state = next_state
        if reward is not None:
            self.reward_history.append(reward)
            self.learn(reward)

    # Q value update
    def learn(self, reward):
        q = self.q_values[self.previous_state][self.previous_action]
        max_q = max(self.q_values[self.state])  # max Q(s')
        # Q(s, a) = Q(s, a) + alpha * (r + gamma * max Q(s') - Q(s, a))
        self.q_values[self.previous_state][self.previous_action] = q + \
            (self.alpha * (reward + (self.gamma * max_q) - q))


class GridWorld:
    def __init__(self):
        # N: Normal, G: Goal, W: Wall, T: Trap
        self.filed_type = {"N": 0, "G": 1, "W": 2, "T": 3}
        self.actions = {"UP": 0, "DOWN": 1, "LEFT": 2, "RIGHT": 3}
        # indexed as self.map[y][x]
        self.map = [[3, 2, 0, 1],
                    [0, 0, 0, 2],
                    [0, 0, 2, 0],
                    [2, 0, 2, 0],
                    [0, 0, 0, 0]]
        self.start_pos = 0, 4  # (x, y)
        self.agent_pos = copy.deepcopy(self.start_pos)

    # execute an action; return the next state, the reward,
    # and whether the episode has ended
    def step(self, action):
        to_x, to_y = copy.deepcopy(self.agent_pos)
        # check whether the move is possible; if not, stay put
        # and receive a negative reward
        if not self._is_possible_action(to_x, to_y, action):
            return self.agent_pos, -1, False
        if action == self.actions["UP"]:
            to_y += -1
        elif action == self.actions["DOWN"]:
            to_y += 1
        elif action == self.actions["LEFT"]:
            to_x += -1
        elif action == self.actions["RIGHT"]:
            to_x += 1
        is_goal = self._is_end_episode(to_x, to_y)  # has the episode ended?
        reward = self._compute_reward(to_x, to_y)
        self.agent_pos = to_x, to_y
        return self.agent_pos, reward, is_goal

    # check whether the episode has ended (Goal or Trap)
    def _is_end_episode(self, x, y):
        if self.map[y][x] == self.filed_type["G"]:
            return True
        if self.map[y][x] == self.filed_type["T"]:
            return True
        return False

    # check whether the cell is a wall
    def _is_wall(self, x, y):
        if self.map[y][x] == self.filed_type["W"]:
            return True
        return False

    # check whether the action can actually be executed
    def _is_possible_action(self, x, y, action):
        to_x = x
        to_y = y
        if action == self.actions["UP"]:
            to_y += -1
        elif action == self.actions["DOWN"]:
            to_y += 1
        elif action == self.actions["LEFT"]:
            to_x += -1
        elif action == self.actions["RIGHT"]:
            to_x += 1
        if len(self.map) <= to_y or 0 > to_y:
            return False
        if len(self.map[0]) <= to_x or 0 > to_x:
            return False
        if self._is_wall(to_x, to_y):
            return False
        return True

    def _compute_reward(self, x, y):
        if self.map[y][x] == self.filed_type["G"]:
            return 100
        if self.map[y][x] == self.filed_type["T"]:
            return -100
        return 0

    def reset(self):
        self.agent_pos = self.start_pos
        return self.start_pos


if __name__ == '__main__':
    main()
```
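To make the learn() update concrete, here is a minimal hand-traced sketch of a single Q update, for the first time the agent ever steps onto the Goal cell. The transition and the names q_sa and max_q are hypothetical, chosen for illustration; alpha and gamma are the values passed in main().

```python
# A minimal sketch of one Q-learning update, mirroring learn() above.
# Assumes alpha=0.1 and gamma=0.9 as in main(); q_sa/max_q are hypothetical names.
alpha, gamma = 0.1, 0.9

q_sa   = 0.0   # Q(s, a): estimate for the move that entered the Goal, still 0
reward = 100   # immediate reward _compute_reward() returns for the Goal cell
max_q  = 0.0   # max Q(s'): the Goal state was just initialized, so all zeros

# Q(s, a) <- Q(s, a) + alpha * (r + gamma * max Q(s') - Q(s, a))
q_sa = q_sa + alpha * (reward + gamma * max_q - q_sa)
print(q_sa)  # 10.0
```

Only a tenth of the goal reward is credited to the last move on this first visit; over repeated episodes the gamma * max Q(s') term should propagate that value backwards to earlier states, which is what makes the plotted reward climb.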
Running the full program above generates the result image below (result.jpg, a plot of the total reward per episode).