lstm：输入批量大小 100 与隐藏 [0] 批量大小 1 不匹配答案

【问题标题】：lstm:Input batch size 100 doesn't match hidden[0] batch size 1lstm：输入批量大小 100 与隐藏 [0] 批量大小 1 不匹配
【发布时间】：2020-11-13 17:00:20
【问题描述】：

我正在尝试将 lstm 层添加到我之前工作的 AI 模型中。添加模型时，我在将批次训练到我的 AI 时遇到此错误。

之前没有 LSTM，错误不存在并且工作正常。

输入批量大小 100 与隐藏 [0] 批量大小 1 不匹配。

我正在使用 nn.LSTMCell

谁能帮我检查一下我是否缺少一些参数来初始化我的 lstmcell，以便它也可以批量输入。

下面是我的代码...

import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from random import random as rndm
from torch.autograd import Variable
from collections import deque

os.chdir("C:\\Users\\granthjain\\Desktop\\startup_code")

torch.set_default_tensor_type('torch.DoubleTensor')

class ReplayBuffer(object):

  def __init__(self, max_size=1e6):
    self.storage = []
    self.max_size = max_size
    self.ptr = 0

  def add(self, transition):
    if len(self.storage) == self.max_size:
      self.storage[int(self.ptr)] = transition
    else:
      self.storage.append(transition)
    self.ptr = (self.ptr + 1) % self.max_size
    
  def sample(self, batch_size):
    
    ind = np.random.randint(0, self.ptr, size=batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
    for i in ind: 
      state, next_state, action, reward, done = self.storage[i]
      
      if state is None:
          continue
      elif next_state is None:
          continue
      elif action is None:
          continue
      elif reward is None:
          continue
      elif done is None:
          continue
          
      batch_states.append(np.array(state, copy=False))
      batch_next_states.append(np.array(next_state, copy=False))
      batch_actions.append(np.array(action, copy=False))
      batch_rewards.append(np.array(reward, copy=False))
      batch_dones.append(np.array(done, copy=False))

    return np.array(batch_states,dtype=object).astype(float), np.array(batch_next_states,dtype=object).astype(float), np.array(batch_actions,dtype=object).astype(float), np.array(batch_rewards,dtype=object).astype(float), np.array(batch_dones,dtype=object).astype(float)

class Actor(nn.Module):
  
  def __init__(self, state_dim, action_dim, max_action):
    super(Actor, self).__init__()
    self.lstm = nn.LSTMCell(state_dim, 256)
    self.layer_1 = nn.Linear(256, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.hx = torch.zeros(1,256)
    self.cx = torch.zeros(1,256)
    self.max_action = max_action

  def forward(self, x):
    self.hx, self.cx = self.lstm(x, (self.hx, self.cx))
    x = F.relu(self.layer_1(self.hx))
    x = F.relu(self.layer_2(x))
    x = self.max_action * torch.tanh(self.layer_3(x))
    return x


class Critic(nn.Module):
  
  def __init__(self, state_dim, action_dim):
    super(Critic, self).__init__()
    # Defining the first Critic neural network
    self.lstm1 = nn.LSTMCell(state_dim + action_dim, 256)
    self.layer_1 = nn.Linear(256, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, 1)
    # Defining the second Critic neural network
    self.lstm2 = nn.LSTMCell(state_dim + action_dim, 256)
    self.layer_4 = nn.Linear(256, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)
    self.hx1 = torch.zeros(1,256)
    self.cx1 = torch.zeros(1,256)
    self.hx2 = torch.zeros(1,256)
    self.cx2 = torch.zeros(1,256)
    
  def forward(self, x, u):
    xu = torch.cat([x, u], 1)
    # Forward-Propagation on the first Critic Neural Network
    self.hx1,self.cx1 = self.lstm(xu, (self.hx1, self.cx1))
    x1 = F.relu(self.layer_1(self.hx1))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    # Forward-Propagation on the second Critic Neural Network
    self.hx2,self.cx2 = self.lstm(xu, (self.hx2, self.cx2))
    x2 = F.relu(self.layer_4(self.hx2))
    x2 = F.relu(self.layer_5(x2))
    x2 = self.layer_6(x2)
    return x1, x2

  def Q1(self, x, u):
    xu = torch.cat([x, u], 1)
    self.hx1,self.cx1 = self.lstm(xu, (self.hx1, self.cx1))
    x1 = F.relu(self.layer_1(self.hx1))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    return x1

# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
  
  def __init__(self, state_dim, action_dim, max_action):
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def reset_hxcx(self):
    self.actor.cx = torch.zeros(1,256)
    self.actor.hx = torch.zeros(1,256)
    self.actor_target.cx = torch.zeros(1,256)
    self.actor_target.hx = torch.zeros(1,256)
    self.critic.cx1 = torch.zeros(1,256)
    self.critic.cx2 = torch.zeros(1,256)
    self.critic.hx1 = torch.zeros(1,256)
    self.critic.hx2 = torch.zeros(1,256)    
    self.critic_target.cx1 = torch.zeros(1,256)
    self.critic_target.cx2 = torch.zeros(1,256)
    self.critic_target.hx1 = torch.zeros(1,256)
    self.critic_target.hx2 = torch.zeros(1,256)      

  def select_action(self, state):
    print("state =", type(state))
    return self.actor(state).cpu().data.numpy().flatten()

  def train(self, replay_buffer, iterations, batch_size=50, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    
    for it in range(iterations):
      
      # Step 4: We sample a batch of transitions (s, s’, a, r) from the memory
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      
      batch_states=batch_states.astype(float)
      batch_next_states=batch_next_states.astype(float)
      batch_actions=batch_actions.astype(float)
      batch_rewards=batch_rewards.astype(float)
      batch_dones=batch_dones.astype(float)
      
      state = torch.from_numpy(batch_states)
      next_state = torch.from_numpy(batch_next_states)
      action = torch.from_numpy(batch_actions)
      reward = torch.from_numpy(batch_rewards)
      done = torch.from_numpy(batch_dones)
      
#      print("actor cx:",self.actor.cx)
#      print("actor hx:",self.actor.hx)
#      print("actor_target cx:",self.actor_target.cx)
#      print("actor_target cx:",self.actor_target.cx)
#      print("self.critic.cx1:",self.critic.cx1)
#      print("self.critic.cx2",self.critic.cx2)
#      print("self.critic.hx1:",self.critic.hx1)
#      print("self.critic.hx2:",self.critic.hx2)
#      print("self.critic_target.cx1:",self.critic_target.cx1)
#      print("self.critic_target.hx1",self.critic_target.hx1)
#      print("self.critic_target.cx2:",self.critic_target.cx2)
#      print("self.critic_target.hx2:",self.critic_target.hx2)

      # Step 5: From the next state s’, the Actor target plays the next action a’
      next_action = self.actor_target(next_state)
      
      # Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment
      noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
      noise = noise.clamp(-noise_clip, noise_clip)
      next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
      
      # Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs
      target_Q1, target_Q2 = self.critic_target(next_state, next_action)
      
      # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
      target_Q = torch.min(target_Q1, target_Q2).double()
      
      
      
      
      # Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor

      done = done.resize_((done.shape[0],1))

      reward = reward.resize_((reward.shape[0],1))

      target_Q = reward + ((1 - done) * discount * target_Q).detach()

      
      
      # Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
      current_Q1, current_Q2 = self.critic(state, action)

      # Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
      
      # Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()
      
      # Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
      if it % policy_freq == 0:
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        # Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        
        # Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
  
  # Making a save method to save a trained model
  def save(self, filename, directory):
    torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
    torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
  
  # Making a load method to load a pre-trained model
  def load(self, filename, directory):
    self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
    self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
    
#set the parameters

start_timesteps = 1e3 # Number of iterations/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
eval_freq = 5e1 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 5e3 # Total number of iterations/timesteps
save_models = True # Boolean checker whether or not to save the pre-trained model
expl_noise = 0.1 # Exploration noise - STD value of exploration Gaussian noise
batch_size = 100 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of Gaussian noise added to the actions for the exploration purposes
noise_clip = 0.5 # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated

state_dim = 3
action_dim = 3
max_action = 1
idx = 0



class env1:
    
    def __init__(self,state_dim,action_dim,data):
        self.state_dim = state_dim
        self.state = torch.zeros(self.state_dim)
        self.state[state_dim-1]=1000.0
        self.next_state = torch.zeros(self.state_dim)
        self.next_state[state_dim-1] = 1000.0
        self.action_dim = action_dim
        self.data = data
        self.idx = 0
        self.count = 0
        self._max_episode_steps = 200
        self.state[1] = self.data[self.idx]
        self.next_state[1] = self.data[self.idx]

    def reset(self):
        self.next_state = torch.zeros(self.state_dim)
        self.next_state[state_dim-1]=1000.0
        self.state = torch.zeros(self.state_dim)
        self.state[state_dim-1]=1000.0
        self.state[1] = self.data[self.idx]
        self.next_state[1] = self.data[self.idx]
        self.count = 0
        ch = self.state[0]
        cp = self.state[1]
        cc = self.state[2]
        st = torch.tensor([ch,cp,cc])
        return st

    def step(self,action):
        done = False
        act_t = torch.argmax(action)
        self.idx += 1
        if(act_t==0):
            num_s = int(self.state[2]/self.state[1])

            self.next_state[0] += num_s
            self.next_state[2] = self.state[2]%self.state[1]

            self.next_state[1] = self.data[self.idx]
        elif(act_t==1):
            self.next_state[1] = self.data[self.idx]
        elif(act_t==2):
            self.next_state[2] = self.state[2]+ self.state[1]*self.state[0]
            self.next_state[0] = 0
            self.next_state[1] = self.data[self.idx]
            
        

        reward = self.next_state[2] - self.state[2] + self.next_state[1]*self.next_state[0] - self.state[1]*self.state[0] -1
        
        self.state[0] = self.next_state[0]
        self.state[1] = self.next_state[1]
        self.state[2] = self.next_state[2]
        
        ch = self.state[0]
        cp = self.state[1]
        cc = self.state[2]
        
        st = torch.tensor([ch,cp,cc])

        self.count = (self.count + 1)%100
        if(self.count==0):
            done = True

        return st, reward, done
policy = TD3(state_dim, action_dim, max_action)


#Create the environment
data = pd.read_csv('PAGEIND.csv')
data = data['Close']
data = np.array(data).reshape(-1,1)

max_timesteps = data.shape[0]
sc = StandardScaler()
data = sc.fit_transform(data)
data = torch.DoubleTensor(data)
env = env1(state_dim,action_dim,data)

replay_buffer = ReplayBuffer()

#init training variables
  
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
t0 = time.time()



# We start the main loop over 500,000 timesteps
while total_timesteps < max_timesteps:

  # If the episode is done
  if done:

    # If we are not at the very beginning, we start the training process of the model
    if total_timesteps != 0:
      print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

   
    # When the training step is done, we reset the state of the environment
    obs = env.reset()
    policy.reset_hxcx()

    # Set the Done to False
    done = False
    
    # Set rewards and episode timesteps to zero
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1
  
  # Before 1000 timesteps, we play random actions
  if total_timesteps < 0.8*max_timesteps:
#random action
      actn = torch.randn(action_dim)
      action = torch.zeros(action_dim)
      action[torch.argmax(actn)] = 1
      
  else: # After 1000 timesteps, we switch to the model
    action = policy.select_action(torch.tensor(obs))
    # If the explore_noise parameter is not 0, we add noise to the action and we clip it
    if expl_noise != 0:
      print("policy action:",action)
      actn = (action + torch.randn(action_dim))
      action = torch.zeros(action_dim)
      action[torch.argmax(actn)] = 1
      
      
  # The agent performs the action in the environment, then reaches the next state and receives the reward
  new_obs, reward, done = env.step(action)
  
  # We check if the episode is done
  done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
  
  # We increase the total reward
  episode_reward += reward
  
  # We store the new transition into the Experience Replay memory (ReplayBuffer)
  replay_buffer.add((obs, new_obs, action, reward, done_bool))

  # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
  obs = new_obs
  episode_timesteps += 1
  total_timesteps += 1
  timesteps_since_eval += 1

以下是错误信息：

    policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

  File "C:/Users/granthjain/Desktop/startup_code/td3_lstm_try.py", line 196, in train
    next_action = self.actor_target(next_state)

  File "C:\Users\granthjain\Anaconda3_1\lib\site-packages\torch\nn\modules\module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)

  File "C:/Users/granthjain/Desktop/startup_code/td3_lstm_try.py", line 79, in forward
    self.hx, self.cx = self.lstm(x, (self.hx, self.cx))

  File "C:\Users\granthjain\Anaconda3_1\lib\site-packages\torch\nn\modules\module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)

  File "C:\Users\granthjain\Anaconda3_1\lib\site-packages\torch\nn\modules\rnn.py", line 708, in forward
    self.check_forward_hidden(input, hx[0], '[0]')

  File "C:\Users\granthjain\Anaconda3_1\lib\site-packages\torch\nn\modules\rnn.py", line 532, in check_forward_hidden
    input.size(0), hidden_label, hx.size(0)))

RuntimeError: Input batch size 100 doesn't match hidden[0] batch size 1

【问题讨论】：

我已经检查了所有 lstm 层的 hx 和 cx 是否已初始化
您确定要使用nn.LSTMCell 吗？这是一个很好的单次计算。LSTM 单元。如果你想将 LSTM 应用于整个输入，你应该使用 nn.LSTM

标签： machine-learning pytorch artificial-intelligence lstm batch-processing

【解决方案1】：

如果你用零初始化你的单元格状态和隐藏状态，根本不需要提供初始化，它会为你提供（默认情况下，请参阅docs）。但是，如果您决定自己做，您应该始终考虑批量大小（每次迭代可能不同）。

最后，nn.LSTMCell 的单元格和隐藏状态都具有形状 (batch_size, hidden_size)，而您在构造函数中使用形状 (1, hidden_size) 初始化它们一次。您必须将初始化移动到 forward() 并且每次调用都从 x 获取批量大小，这应该只是 x.shape[0]

附带说明，您使用的是nn.LSTMCell，这只是一个单细胞计算。使用一次并没有真正意义，请确保这对您有用。也许只是nn.LSTM？

【讨论】：

您好，感谢您的回复。我更正了上述错误，但现在出现错误：- RuntimeError: Dimension out of range (expected to be in range of [-1, 0], but got 1) 你能帮忙检查一下代码有什么问题吗？ .我假设尺寸是正确的....可能是旧的pytorch版本问题吗？ stackoverflow.com/questions/63072770/…
您好，感谢您的回复。我尝试了 nn.LSTM，它的速度更快。