【发布时间】:2020-03-24 15:35:52
【问题描述】:
我正在尝试建立一个深度 Q 网络来玩蛇。我将游戏设计为使窗口为 600 x 600,并且蛇的头部每刻移动 30 个像素。我使用内存重放和目标网络实现了 DQN 算法,但是一旦策略网络开始更新其权重,训练就会显着减慢,以至于权重更新循环的每次迭代大约需要 5 分钟。此外,即使在训练了大约 500 集之后,我也发现代理的表现几乎没有任何改善。这是代理的代码:
import numpy as np
import tensorflow as tf
from snake_rl.envs.snake_env import SnakeEnv
import random
from Game.experience import Experience
import time
import pygame
from PIL import Image
from keras import Sequential
from keras.layers import Conv2D, Dense, BatchNormalization, Activation, Flatten, Reshape
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
class Brain:
def __init__(self, learning_rate, discount_rate, eps_start, eps_end, eps_decay, memory_size, batch_size, max_episodes, max_steps, target_update):
self.memory = []
self.push_count = 0
self.learning_rate = learning_rate
self.discount_rate = discount_rate
self.eps_start = eps_start
self.current_eps = eps_start
self.eps_end = eps_end
self.eps_decay = eps_decay
self.memory_size = memory_size
self.batch_size = batch_size
self.max_steps = max_steps
self.max_episodes = max_episodes
self.current_episode = 1
self.policy_model = None
self.replay_model = None
self.target_update = target_update
pygame.init()
self.screen = pygame.display.set_mode((600, 600))
pygame.display.set_caption("Snake")
def build_model(self):
self.policy_model = Sequential()
self.policy_model.add(Conv2D(8, (5, 5), padding = 'same', activation = 'relu', data_format = "channels_last", input_shape = (600, 600, 2)))
self.policy_model.add(Conv2D(16, (5, 5), padding="same", activation="relu"))
self.policy_model.add(Conv2D(32, (5, 5), padding="same", activation="relu"))
self.policy_model.add(Flatten())
self.policy_model.add(Dense(16, activation = "relu"))
self.policy_model.add(Dense(5, activation = "softmax"))
self.policy_model.compile(optimizer = 'rmsprop', loss = 'mean_squared_error')
self.replay_model = Sequential()
self.replay_model.add(Conv2D(8, (5, 5), padding = 'same', activation = 'relu', data_format = "channels_last", input_shape = (600, 600, 2)))
self.replay_model.add(Conv2D(16, (5, 5), padding="same", activation="relu"))
self.replay_model.add(Conv2D(32, (5, 5), padding="same", activation="relu"))
self.replay_model.add(Flatten())
self.replay_model.add(Dense(16, activation = "relu"))
self.replay_model.add(Dense(5, activation = "softmax"))
self.replay_model.compile(optimizer = 'rmsprop', loss = 'mean_squared_error')
print(self.policy_model.summary())
def decay_epsilon(self, episode):
self.current_eps = self.eps_end + (self.eps_start - self.eps_end) * np.exp(-self.eps_decay * episode)
def push_memory(self, new_memory):
if(len(self.memory) < self.memory_size):
self.memory.append(new_memory)
else:
self.memory[self.push_count % self.memory_size] = new_memory
self.push_count += 1
def sample_memory(self):
return random.sample(self.memory, self.batch_size)
def can_sample_memory(self):
return len(self.memory) >= self.batch_size
def screenshot(self):
data = pygame.image.tostring(self.screen, 'RGB')
image = Image.frombytes('RGB', (600, 600), data)
image = image.convert('LA')
matrix = np.asarray(image.getdata(), dtype=np.uint8)
matrix = (matrix - 128)/(128 - 1)
matrix = np.reshape(matrix, (1, 600, 600, 2))
return matrix
def train(self):
tf.logging.set_verbosity(tf.logging.ERROR)
self.build_model()
for episode in range(self.max_episodes):
self.current_episode = episode
env = SnakeEnv(self.screen)
episode_reward = 0
for timestep in range(self.max_steps):
env.render(self.screen)
state = self.screenshot()
#state = env.get_state()
action = None
epsilon = self.current_eps
if epsilon > random.random():
action = np.random.choice(env.action_space) #explore
else:
values = self.policy_model.predict(state) #exploit
action = np.argmax(values)
experience = env.step(action)
if(experience['done'] == True):
episode_reward += experience['reward']
break
episode_reward += experience['reward']
self.push_memory(Experience(experience['state'], experience['action'], experience['reward'], experience['next_state']))
self.decay_epsilon(episode)
if self.can_sample_memory():
memory_sample = self.sample_memory()
X = []
Y = []
for memory in memory_sample:
memstate = memory.state
action = memory.action
next_state = memory.next_state
reward = memory.reward
max_q = reward + (self.discount_rate * self.replay_model.predict(next_state)) #bellman equation
X.append(memstate)
Y.append(max_q)
X = np.array(X)
X = X.reshape([-1, 600, 600, 2])
Y = np.array(Y)
Y = Y.reshape([128, 5])
self.policy_model.fit(X, Y)
print("Episode: ", episode, " Total Reward: ", episode_reward)
if episode % self.target_update == 0:
self.replay_model.set_weights(self.policy_model.get_weights())
self.policy_model.save_weights('weights.hdf5')
pygame.quit()
def render(self):
self.env.render(self.screen)
def choose_action(self, state):
q_values = self.policy_model.predict(state)
action = np.amax(q_values)
return action
def load(self):
self.build_model()
self.policy_model.load_weights("weights.hdf5")
def play(self):
for episode in range(100):
env = SnakeEnv(self.screen)
for timestep in range(1000):
env.render(self.screen)
pred = self.policy_model.predict(env.get_state())
print(np.array(pred))
action = np.amax(pred)
d = env.step(action)
if(d['done'] == True):
break
我的超参数如下:
learning_rate = 0.5
discount_rate = 0.99
eps_start = 1
eps_end = .01
eps_decay = .001
memory_size = 100000
batch_size = 128
max_episodes = 1000
max_steps = 5000
target_update = 10
有人对如何加快训练和提高性能有任何建议吗?
【问题讨论】:
标签: python tensorflow keras deep-learning reinforcement-learning