【发布时间】:2021-11-06 21:33:15
【问题描述】:
我正在尝试将我的 tensorflow 代码转换为 pytorch。
简单地说,它使用 CNN 从图像中估计 7 个值(数字)。(回归量)
主干网络是预训练权重的vgg16,我想将最后一个fcl(实际上由于ImageNet数据集,最后一个fcl输出1000个类)转换为(4096 x 4096),并添加更多fcl。
之前:
vgg 最后整箱 (4096 x 1000)
之后:
vgg 最后一个 fcl(更改为 4096 x 4096)
----添加 fcl1 (4096 x 4096)
----添加 fcl2 (4096 x 2048)
└ 添加 fclx (2048 x 3)
└ 添加 fclq (2048 x 4)
:fcl2连接两个不同的张量,大小分别为3和4
在这里,我尝试只用一张图像(仅用于调试)和带有 L2 损失的 GT 值(7 个值)来做。 如果我使用 Tensorflow 这样做,损失会大大减少,当我推断图像时,它给出的值几乎与 GT 相似。
但是,如果我尝试使用 Pytorch 来做,看起来训练效果不佳。
我猜损失应该在训练时急剧减少(几乎每次迭代)
有什么问题?
- 损失实际上是 |x-x'|^2 + b|q-q'|^2,众所周知的 L2-norm 用于 PoseNet (Kendall, 2015)。 x 具有三个位置值,q 具有四个四元数(旋转)值。 b 是用户确定的超参数。
from torchvision import models
import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.optim as optim
import os
import os.path
import torch.utils.data as data
from torchvision import transforms as T
from PIL import Image
class DataSource(data.Dataset):
def __init__(self, root, train=True, transforms=None, txtName='dataset_train'):
self.root = os.path.expanduser(root)
self.transforms = transforms
self.train = train
self.imageFormat = '.jpg'
self.image_poses = []
self.image_paths = []
self.txtName = txtName
self._get_data()
if transforms is None:
normalize = T.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
if not train:
self.transforms = T.Compose(
[T.Resize(256),
T.CenterCrop(224),
T.ToTensor(),
normalize]
)
else:
self.transforms = T.Compose(
[T.Resize(256),
T.CenterCrop(224),
# T.RandomCrop(224),
T.ToTensor(),
normalize]
)
def _get_data(self):
txt_file = self.root + '/' + self.txtName + '.txt'
count = 0
with open(txt_file, 'r') as f:
for line in f:
if len(line.split()) != 8:
next(f)
fname, p0, p1, p2, p3, p4, p5, p6 = line.split()
p0 = float(p0); p1 = float(p1); p2 = float(p2);
p3 = float(p3); p4 = float(p4); p5 = float(p5); p6 = float(p6)
ImageFullName = self.root + '/' + fname
if count == 0:
if os.path.isfile(ImageFullName) == False:
self.imageFormat = '.png'
if self.imageFormat != '.jpg':
ImageFullName = ImageFullName.replace('.jpg', self.imageFormat)
self.image_poses.append([p0, p1, p2, p3, p4, p5, p6])
self.image_paths.append(ImageFullName)
count += 1
print('Total : ', len(self.image_paths), ' images')
def __getitem__(self, index):
img_path = self.image_paths[index]
img_pose = self.image_poses[index]
data = Image.open(img_path)
data = self.transforms(data)
return data, torch.tensor(img_pose)
def __len__(self):
return len(self.image_paths)
class PoseLoss(nn.Module):
def __init__(self, beta, device = 'cuda'):
super(PoseLoss, self).__init__()
self.beta = beta
self.device = device
self.t_loss_fn = nn.MSELoss()
def forward(self, x, q, poseGT):
GT_x = poseGT[:, 0:3]
GT_q = poseGT[:, 3:]
xx = Variable(x, requires_grad=True).to(self.device)
qq = Variable(q, requires_grad=True).to(self.device)
print('GT', GT_x, GT_q)
print('Estim', xx, qq)
loss = torch.sqrt(self.t_loss_fn(GT_x[:, :3].cpu(), xx[:, :3].cpu())) + self.beta*torch.sqrt(self.t_loss_fn(GT_q[:, 3:].cpu(), qq[:, 3:].cpu()))
return loss
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.backbone = models.vgg16(pretrained=True)
self.backbone._modules['classifier'][6] = nn.ReLU(nn.Linear(4096, 4096))
self.fcl = nn.Sequential(nn.Linear(4096, 4096), nn.ReLU(), nn.Linear(4096, 2048), nn.ReLU())
self.xyz = nn.Linear(2048, 3)
self.q = nn.Linear(2048, 4)
def forward(self, x):
x1 = self.backbone(x)
x2 = self.fcl(x1)
xyz = self.xyz(x2)
q = self.q(x2)
return xyz, q
batch_size = 1
learning_rate = 10e-5
training_epochs = 100
if __name__ == "__main__":
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data = DataSource(DatasetDirectory + DatasetFolder, train=True, transforms=None, txtName=TrainDatasetList)
data_loader = torch.utils.data.DataLoader(dataset=data, batch_size=batch_size, shuffle=False, num_workers=4)
model = Net().to(device)
model.train()
criterion = PoseLoss(beta = 100, device = device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas = (0.9, 0.999), eps =0.00000001)
iteration = 0
minloss = 10e8
minlossindex = -1
for epoch in range(1, training_epochs):
dataiter = iter(data_loader)
for Images, Poses in dataiter:
optimizer.zero_grad()
Images = Images.to(device).float()
x, q = model(Images)
loss = criterion(x, q, Poses)
loss.backward()
loss = loss.item()/ batch_size
optimizer.step()
print(epoch, ' : ', iteration , ' -> ' , loss, ' minloss ', minloss, ' at ', minlossindex)
if loss < minloss:
minloss = loss
minlossindex = iteration
if epoch < (int)(training_epochs*0.8):
torch.save(model.state_dict(), 'Min.pth')
iteration = iteration + 1
torch.save(model.state_dict(), 'Fin.pth')
所有 7 个值的估计结果往往为零,我想不出它为什么给出这样的值。
另外,正如我上面提到的,损失值在训练时并没有显着降低(我预计每次迭代都应该显着降低,直到收敛,因为我只使用了一张图像进行训练)
【问题讨论】:
-
可能不相关,但您采用了 MSE 的
torch.sqrt(平均 平方 错误),即两次sqrt。另外,你为什么使用Variable(..., requires_grad=True)?VariableAPI 已弃用很长时间。 -
解决方案:我将所有张量更改为在 GPU(cuda) 上工作,并转换 self.backbone._modules['classifier'][6] = nn.ReLU(nn.Linear(4096, 4096) ) 到 nn.Sequential。
标签: python tensorflow deep-learning pytorch