【发布时间】:2019-08-20 01:22:43
【问题描述】:
这部分是动作转移概率
def _calculate_transition_prob(self, current, delta):
new_position = np.array(current) + np.array(delta)
new_position =self._limit_coordinates(new_position).astype(int)
new_state = np.ravel_multi_index(tuple(new_position), self.shape)
reward = self.reward
is_done = self._cliff[tuple(new_position)] or (tuple(new_position) == (4,11))
return [(1.0, new_state, reward, is_done)]
这部分我想使用奖励函数作为参数
def reward(reward, self):
self.reward = -100.0 if self._cliff[tuple(new_position)] else -1.0
return reward
这部分是q学习(RL)算法
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
Q = defaultdict(lambda: np.zeros(env.action_space.n))
episode_lengths = np.zeros(num_episodes)
episode_rewards = np.zeros(num_episodes)
policy = epsilon_greedy_policy(Q, epsilon, env.action_space.n)
for i_episode in range(num_episodes):
state = env.reset()
for t in itertools.count():
action_probs = policy(state)
action = np.random.choice(np.arange(len(action_probs)), p = action_probs)
next_state, reward, done, _ = env.step(action)
episode_rewards[i_episode] += reward
episode_lengths[i_episode] = t
【问题讨论】:
-
错误到底发生在哪里?