【发布时间】:2019-07-16 09:52:05
【问题描述】:
我正在将 Keras 代码转换为 PyTorch,因为我对后者比前者更熟悉。但是,我发现它没有学习(或几乎没有学习)。
下面我提供了我几乎所有的 PyTorch 代码,包括初始化代码,以便您自己尝试。您唯一需要自己提供的是词嵌入(我相信您可以在网上找到许多 word2vec 模型)。第一个输入文件应该是一个带有标记文本的文件,第二个输入文件应该是一个带有浮点数的文件,每行一个。因为我已经提供了所有代码,所以这个问题可能看起来很大而且过于广泛。但是,我认为我的问题足够具体:我的模型或训练循环中有什么问题导致我的模型没有或几乎没有改进。 (结果见下文。)
我已尝试在适用的情况下提供许多 cmets,并且我还提供了形状转换,因此您不必 运行代码来查看发生了什么。检查数据准备方法并不重要。
最重要的部分是RegressorNet 的前向方法和RegressionNN 的训练循环(诚然,这些名称选错了)。我认为错误存在于某个地方。
from pathlib import Path
import time
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import gensim
from scipy.stats import pearsonr
from LazyTextDataset import LazyTextDataset
class RegressorNet(nn.Module):
def __init__(self, hidden_dim, embeddings=None, drop_prob=0.0):
super(RegressorNet, self).__init__()
self.hidden_dim = hidden_dim
self.drop_prob = drop_prob
# Load pretrained w2v model, but freeze it: don't retrain it.
self.word_embeddings = nn.Embedding.from_pretrained(embeddings)
self.word_embeddings.weight.requires_grad = False
self.w2v_rnode = nn.GRU(embeddings.size(1), hidden_dim, bidirectional=True, dropout=drop_prob)
self.dropout = nn.Dropout(drop_prob)
self.linear = nn.Linear(hidden_dim * 2, 1)
# LeakyReLU rather than ReLU so that we don't get stuck in a dead nodes
self.lrelu = nn.LeakyReLU()
def forward(self, batch_size, sentence_input):
# shape sizes for:
# * batch_size 128
# * embeddings of dim 146
# * hidden dim of 200
# * sentence length of 20
# sentence_input: torch.Size([128, 20])
# Get word2vec vector representation
embeds = self.word_embeddings(sentence_input)
# embeds: torch.Size([128, 20, 146])
# embeds.view(-1, batch_size, embeds.size(2)): torch.Size([20, 128, 146])
# Input vectors into GRU, only keep track of output
w2v_out, _ = self.w2v_rnode(embeds.view(-1, batch_size, embeds.size(2)))
# w2v_out = torch.Size([20, 128, 400])
# Leaky ReLU it
w2v_out = self.lrelu(w2v_out)
# Dropout some nodes
if self.drop_prob > 0:
w2v_out = self.dropout(w2v_out)
# w2v_out: torch.Size([20, 128, 400
# w2v_out[-1, :, :]: torch.Size([128, 400])
# Only use the last output of a sequence! Supposedly that cell outputs the final information
regression = self.linear(w2v_out[-1, :, :])
regression: torch.Size([128, 1])
return regression
class RegressionRNN:
def __init__(self, train_files=None, test_files=None, dev_files=None):
print('Using torch ' + torch.__version__)
self.datasets, self.dataloaders = RegressionRNN._set_data_loaders(train_files, test_files, dev_files)
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.model = self.w2v_vocab = self.criterion = self.optimizer = self.scheduler = None
@staticmethod
def _set_data_loaders(train_files, test_files, dev_files):
# labels must be the last input file
datasets = {
'train': LazyTextDataset(train_files) if train_files is not None else None,
'test': LazyTextDataset(test_files) if test_files is not None else None,
'valid': LazyTextDataset(dev_files) if dev_files is not None else None
}
dataloaders = {
'train': DataLoader(datasets['train'], batch_size=128, shuffle=True, num_workers=4) if train_files is not None else None,
'test': DataLoader(datasets['test'], batch_size=128, num_workers=4) if test_files is not None else None,
'valid': DataLoader(datasets['valid'], batch_size=128, num_workers=4) if dev_files is not None else None
}
return datasets, dataloaders
@staticmethod
def prepare_lines(data, split_on=None, cast_to=None, min_size=None, pad_str=None, max_size=None, to_numpy=False,
list_internal=False):
""" Converts the string input (line) to an applicable format. """
out = []
for line in data:
line = line.strip()
if split_on:
line = line.split(split_on)
line = list(filter(None, line))
else:
line = [line]
if cast_to is not None:
line = [cast_to(l) for l in line]
if min_size is not None and len(line) < min_size:
# pad line up to a number of tokens
line += (min_size - len(line)) * ['@pad@']
elif max_size and len(line) > max_size:
line = line[:max_size]
if list_internal:
line = [[item] for item in line]
if to_numpy:
line = np.array(line)
out.append(line)
if to_numpy:
out = np.array(out)
return out
def prepare_w2v(self, data):
idxs = []
for seq in data:
tok_idxs = []
for word in seq:
# For every word, get its index in the w2v model.
# If it doesn't exist, use @unk@ (available in the model).
try:
tok_idxs.append(self.w2v_vocab[word].index)
except KeyError:
tok_idxs.append(self.w2v_vocab['@unk@'].index)
idxs.append(tok_idxs)
idxs = torch.tensor(idxs, dtype=torch.long)
return idxs
def train(self, epochs=10):
valid_loss_min = np.Inf
train_losses, valid_losses = [], []
for epoch in range(1, epochs + 1):
epoch_start = time.time()
train_loss, train_results = self._train_valid('train')
valid_loss, valid_results = self._train_valid('valid')
# Calculate Pearson correlation between prediction and target
try:
train_pearson = pearsonr(train_results['predictions'], train_results['targets'])
except FloatingPointError:
train_pearson = "Could not calculate Pearsonr"
try:
valid_pearson = pearsonr(valid_results['predictions'], valid_results['targets'])
except FloatingPointError:
valid_pearson = "Could not calculate Pearsonr"
# calculate average losses
train_loss = np.mean(train_loss)
valid_loss = np.mean(valid_loss)
train_losses.append(train_loss)
valid_losses.append(valid_loss)
# print training/validation statistics
print(f'----------\n'
f'Epoch {epoch} - completed in {(time.time() - epoch_start):.0f} seconds\n'
f'Training Loss: {train_loss:.6f}\t Pearson: {train_pearson}\n'
f'Validation loss: {valid_loss:.6f}\t Pearson: {valid_pearson}')
# validation loss has decreased
if valid_loss <= valid_loss_min and train_loss > valid_loss:
print(f'!! Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
valid_loss_min = valid_loss
if train_loss <= valid_loss:
print('!! Training loss is lte validation loss. Might be overfitting!')
# Optimise with scheduler
if self.scheduler is not None:
self.scheduler.step(valid_loss)
print('Done training...')
def _train_valid(self, do):
""" Do training or validating. """
if do not in ('train', 'valid'):
raise ValueError("Use 'train' or 'valid' for 'do'.")
results = {'predictions': np.array([]), 'targets': np.array([])}
losses = np.array([])
self.model = self.model.to(self.device)
if do == 'train':
self.model.train()
torch.set_grad_enabled(True)
else:
self.model.eval()
torch.set_grad_enabled(False)
for batch_idx, data in enumerate(self.dataloaders[do], 1):
# 1. Data prep
sentence = data[0]
target = data[-1]
curr_batch_size = target.size(0)
# Returns list of tokens, possibly padded @pad@
sentence = self.prepare_lines(sentence, split_on=' ', min_size=20, max_size=20)
# Converts tokens into w2v IDs as a Tensor
sent_w2v_idxs = self.prepare_w2v(sentence)
# Converts output to Tensor of floats
target = torch.Tensor(self.prepare_lines(target, cast_to=float))
# Move input to device
sent_w2v_idxs, target = sent_w2v_idxs.to(self.device), target.to(self.device)
# 2. Predictions
pred = self.model(curr_batch_size, sentence_input=sent_w2v_idxs)
loss = self.criterion(pred, target)
# 3. Optimise during training
if do == 'train':
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
# 4. Save results
pred = pred.detach().cpu().numpy()
target = target.cpu().numpy()
results['predictions'] = np.append(results['predictions'], pred, axis=None)
results['targets'] = np.append(results['targets'], target, axis=None)
losses = np.append(losses, float(loss))
torch.set_grad_enabled(True)
return losses, results
if __name__ == '__main__':
HIDDEN_DIM = 200
# Load embeddings from pretrained gensim model
embed_p = Path('path-to.w2v_model').resolve()
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(str(embed_p))
# add a padding token with only zeros
w2v_model.add(['@pad@'], [np.zeros(w2v_model.vectors.shape[1])])
embed_weights = torch.FloatTensor(w2v_model.vectors)
# Text files are used as input. Every line is one datapoint.
# *.tok.low.*: tokenized (space-separated) sentences
# *.cross: one floating point number per line, which we are trying to predict
regr = RegressionRNN(train_files=(r'train.tok.low.en',
r'train.cross'),
dev_files=(r'dev.tok.low.en',
r'dev.cross'),
test_files=(r'test.tok.low.en',
r'test.cross'))
regr.w2v_vocab = w2v_model.vocab
regr.model = RegressorNet(HIDDEN_DIM, embed_weights, drop_prob=0.2)
regr.criterion = nn.MSELoss()
regr.optimizer = optim.Adam(list(regr.model.parameters())[0:], lr=0.001)
regr.scheduler = optim.lr_scheduler.ReduceLROnPlateau(regr.optimizer, 'min', factor=0.1, patience=5, verbose=True)
regr.train(epochs=100)
关于LazyTextDataset,可以参考下面的类。
from torch.utils.data import Dataset
import linecache
class LazyTextDataset(Dataset):
def __init__(self, paths):
# labels are in the last path
self.paths, self.labels_path = paths[:-1], paths[-1]
with open(self.labels_path, encoding='utf-8') as fhin:
lines = 0
for line in fhin:
if line.strip() != '':
lines += 1
self.num_entries = lines
def __getitem__(self, idx):
data = [linecache.getline(p, idx + 1) for p in self.paths]
label = linecache.getline(self.labels_path, idx + 1)
return (*data, label)
def __len__(self):
return self.num_entries
正如我之前所写,我正在尝试将 Keras 模型转换为 PyTorch。原始 Keras 代码不使用嵌入层,而是使用每个句子的预构建 word2vec 向量作为输入。在下面的模型中,没有嵌入层。 Keras 摘要看起来像这样(我无权访问基本模型设置)。
Layer (type) Output Shape Param # Connected to
====================================================================================================
bidirectional_1 (Bidirectional) (200, 400) 417600
____________________________________________________________________________________________________
dropout_1 (Dropout) (200, 800) 0 merge_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (200, 1) 801 dropout_1[0][0]
====================================================================================================
问题在于,在输入相同的情况下,Keras 模型工作,并在预测标签和实际标签之间获得 +0.5 的 Pearson 相关性。但是,上面的 PyTorch 模型似乎根本不起作用。给你一个概念,这里是第一个 epoch 之后的损失(均方误差)和 Pearson(相关系数,p 值):
Epoch 1 - completed in 11 seconds
Training Loss: 1.684495 Pearson: (-0.0006077809280690612, 0.8173368901481127)
Validation loss: 1.708228 Pearson: (0.017794288315261794, 0.4264098054188664)
在第 100 个 epoch 之后:
Epoch 100 - completed in 11 seconds
Training Loss: 1.660194 Pearson: (0.0020315421756790806, 0.4400929436716754)
Validation loss: 1.704910 Pearson: (-0.017288118524826892, 0.4396865964324158)
损失如下图所示(当您查看 Y 轴时,您可以看到改进很小)。
可能有问题的最后一个指标是,对于我的 140K 行输入,在我的 GTX 1080TI 上每个 epoch 只需要 10 秒。我觉得他的并不多,我猜想优化没有工作/运行。不过,我不知道为什么。问题可能在我的训练循环或模型本身中,但我找不到它。
再一次,一定是出了点问题,因为: - Keras 模型确实表现良好; - 140K 句子的训练速度“太快” - 训练后几乎没有任何改进。
我错过了什么?该问题很可能存在于训练循环或网络结构中。
【问题讨论】:
-
因为太宽泛而投反对票甚至投票结束的人:请阅读整个问题,如果您阅读的内容过多,请转到下一个问题。出于可重复性的目的,该代码非常详细。如果您只想查看问题出在哪里,请查看网络类和训练循环。
-
非常适合在没有线性的情况下使用“回归”一词。
标签: python tensorflow machine-learning keras pytorch