如果您有一组图像(例如视频帧),并且它们在时间上相互关联,
您可以遍历这些帧,将它们一个一个地传入 conv2d 卷积层。
请查看我的 git 仓库中这个用于视频动作识别的示例:
class Net(nn.Module):
    """CNN + LSTM video-action classifier.

    A (mostly frozen) pretrained VGG19 convolutional backbone extracts a
    feature vector from every frame; an LSTM then runs over the frame
    sequence and a small fully-connected head produces the class logits.
    """

    def __init__(self):
        super(Net, self).__init__()
        num_classes = 1
        dr_rate = 0.2
        pretrained = True
        rnn_hidden_size = 30
        rnn_num_layers = 2
        # Take only the convolutional part of a pretrained VGG19 and
        # fine-tune just its last few layers.
        baseModel = models.vgg19(pretrained=pretrained).features
        # Freeze children 0..27; leave the remaining ones trainable.
        for i, child in enumerate(baseModel.children()):
            requires_grad = i >= 28
            for param in child.parameters():
                param.requires_grad = requires_grad
        # Flattened size of the VGG19 feature map for one frame.
        # 12800 = 512 * 5 * 5 — presumably assumes ~160x160 input frames;
        # TODO confirm against the actual frame size used in training.
        num_features = 12800
        self.baseModel = baseModel
        self.dropout = nn.Dropout(dr_rate)
        self.rnn = nn.LSTM(num_features, rnn_hidden_size, rnn_num_layers,
                           batch_first=True)
        # Use rnn_hidden_size instead of a hard-coded 30 so the head stays
        # consistent with the LSTM if the hidden size is ever changed.
        self.fc2 = nn.Linear(rnn_hidden_size, 256)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Map a frame batch of shape (batch, time, C, H, W) to logits.

        Returns a tensor of shape (batch, num_classes).
        """
        batch_size, time_steps, C, H, W = x.size()
        # Fold time into the batch so the conv layers see ordinary images:
        # (batch_size * time_steps, C, H, W).
        x = x.contiguous().view(batch_size * time_steps, C, H, W)
        x = self.baseModel(x)
        # Flatten each frame's feature map to a vector.
        x = x.view(x.size(0), -1)
        # Restore the sequence shape (batch, time, features) for the LSTM.
        x = x.contiguous().view(batch_size, time_steps, x.size(-1))
        x, (hn, cn) = self.rnn(x)
        # Use only the last time step's output, not the full sequence.
        x = F.relu(self.fc2(x[:, -1, :]))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
主要思想是:在这个代码块中,我们把每一帧(图像)分别送入卷积网络,然后重塑其输出,再把它馈送到新的网络(LSTM)中。
# Fold the time dimension into the batch: (batch_size * timesteps, C, H, W),
# so each frame is processed as an ordinary image.
x = x.contiguous().view(batch_size * time_steps, C, H, W)
# Feed every frame through the pre-trained conv model.
x = self.baseModel(x)
# Flatten each frame's feature map into a single vector.
x = x.view(x.size(0), -1)
# Restore the sequence shape (batch_size, timesteps, output_size).
x = x.contiguous().view(batch_size , time_steps , x.size(-1)) # this x is now ready to be fed into the LSTM layer