将 PyTorch 代码从 CPU 移植到 GPU答案

【问题标题】：Porting PyTorch code from CPU to GPU将 PyTorch 代码从 CPU 移植到 GPU
【发布时间】：2018-03-24 01:56:43
【问题描述】：

按照https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb的教程进行操作

有一个USE_CUDA 标志，用于控制 CPU（当为 False）到 GPU（当为 True）类型之间的变量和张量类型。

使用来自en-fr.tsv 的数据并将句子转换为变量：

import unicodedata
import string
import re
import random
import time
import math

from gensim.corpora.dictionary import Dictionary

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import LongTensor, FloatTensor
from torch import optim
import torch.nn.functional as F

import numpy as np

MAX_LENGTH = 10
USE_CUDA = False

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

SOS_IDX, SOS_TOKEN = 0, '<s>'
EOS_IDX, EOS_TOKEN = 1, '</s>'
UNK_IDX, UNK_TOKEN = 2, '<unk>'
PAD_IDX, PAD_TOKEN = 3, '<blank>'

lines = open('en-fr.tsv').read().strip().split('\n')
pairs = [[normalize_string(s).split() for s in l.split('\t')] for l in lines]
src_sents, trg_sents = zip(*pairs)

src_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
src_dict.add_documents(src_sents)

trg_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
trg_dict.add_documents(trg_sents)

def variablize_sentences(sentence, dictionary):
    indices = [dictionary.token2id[tok] for tok in sentence] + [dictionary.token2id[EOS_TOKEN]]
    var = Variable(LongTensor(indices).view(-1, 1))
    return var.cuda() if USE_CUDA else var

input_variables = [variablize_sentences(sent, src_dict) for sent in src_sents]
output_variables = [variablize_sentences(sent, trg_dict) for sent in trg_sents]

并使用 Encoder-Attn-Decoder 网络：

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)    
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)

        self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
        self.gru = self.gru.cuda() if USE_CUDA else self.gru

    def forward(self, word_inputs, hidden):
        seq_len = len(word_inputs)

        embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
        embedded = embedded.cuda() if USE_CUDA else embedded

        output, hidden = self.gru(embedded, hidden)
        output = output.cuda() if USE_CUDA else output
        hiddne = hidden.cuda() if USE_CUDA else hidden

        return output, hidden

    def init_hidden(self):
        hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
        return hidden.cuda() if USE_CUDA else hidden

class Attn(nn.Module):
    def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
        super(Attn, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.other = nn.Parameter(FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        seq_len = len(encoder_outputs)

        # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(seq_len)) # B x 1 x S
        attn_energies = attn_energies.cuda() if USE_CUDA else attn_energies
        # Calculate energies for each encoder output
        for i in range(seq_len):
            attn_energies[i] = self.score(hidden, encoder_outputs[i])

        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
        return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)

    def score(self, hidden, encoder_output):
        if self.method == 'dot':
            energy =torch.dot(hidden.view(-1), encoder_output.view(-1))
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = torch.dot(hidden.view(-1), energy.view(-1))
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = torch.dot(self.v.view(-1), energy.view(-1))
        return energy

class AttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()

        # Keep parameters for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size * 2, output_size)

        self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
        self.gru = self.gru.cuda() if USE_CUDA else self.gru
        self.out = self.out.cuda() if USE_CUDA else self.out


        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)
            self.attn = self.attn.cuda() if USE_CUDA else self.attn

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        # Note: we run this one step at a time

        # Get the embedding of the current input word (last output word)
        word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N

        # Combine embedded input word and last context, run through RNN
        rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
        rnn_output, hidden = self.gru(rnn_input, last_hidden)

        # Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
        attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N

        # Final output layer (next word prediction) using the RNN hidden state and context vector
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1)       # B x S=1 x N -> B x N
        output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))

        if USE_CUDA:
            return output.cuda(), context.cuda(), hidden.cuda(), attn_weights.cuda()
        else:
            return output, context, hidden, attn_weights

并测试网络：

encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L

encoder_hidden = encoder_test.init_hidden()
if USE_CUDA:
    word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda())
else:
    word_inputs = Variable(torch.LongTensor([1, 2, 3]))
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))

decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
print(decoder_output)
print(decoder_hidden)
print(decoder_attn)

代码在 CPU 上运行良好，

[出]：

EncoderRNN (
  (embedding): Embedding(10, 10)
  (gru): GRU(10, 10, num_layers=2)
)
AttnDecoderRNN (
  (embedding): Embedding(10, 10)
  (gru): GRU(20, 10, num_layers=2, dropout=0.1)
  (out): Linear (20 -> 10)
  (attn): Attn (
    (attn): Linear (10 -> 10)
  )
)
Variable containing:
-2.4378 -2.3556 -2.3391 -2.5070 -2.3439 -2.3415 -2.3976 -2.1832 -1.9976 -2.2213
[torch.FloatTensor of size 1x10]

Variable containing:
(0 ,.,.) = 

Columns 0 to 8 
  -0.2325  0.0775  0.5415  0.4876 -0.5771 -0.0687  0.1832 -0.5285  0.2508

Columns 9 to 9 
  -0.1837

(1 ,.,.) = 

Columns 0 to 8 
  -0.1389 -0.2605 -0.0518  0.3405  0.0774  0.1815  0.0297 -0.1304 -0.1015

Columns 9 to 9 
   0.2602
[torch.FloatTensor of size 2x1x10]

Variable containing:
(0 ,.,.) = 
  0.3334  0.3291  0.3374
[torch.FloatTensor of size 1x1x3]

但是将标志更改为USE_GPU=True时，会在初始化decoder_test对象时抛出错误，它会抛出TypeError：

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-76-b3c660013934> in <module>()
     12 decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
     13 
---> 14 decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
     15 print(decoder_output)
     16 print(decoder_hidden)

~/.local/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    222         for hook in self._forward_pre_hooks.values():
    223             hook(self, input)
--> 224         result = self.forward(*input, **kwargs)
    225         for hook in self._forward_hooks.values():
    226             hook_result = hook(self, input, result)

<ipython-input-75-34ecfe9b3112> in forward(self, word_input, last_context, last_hidden, encoder_outputs)
     32 
     33         # Combine embedded input word and last context, run through RNN
---> 34         rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
     35         rnn_output, hidden = self.gru(rnn_input, last_hidden)
     36 

~/.local/lib/python3.5/site-packages/torch/autograd/variable.py in cat(iterable, dim)
    895         @staticmethod
    896         def cat(iterable, dim=0):
--> 897             return Concat.apply(dim, *iterable)
    898 
    899         @staticmethod

~/.local/lib/python3.5/site-packages/torch/autograd/_functions/tensor.py in forward(ctx, dim, *inputs)
    315         ctx.dim = dim
    316         ctx.input_sizes = [i.size(dim) for i in inputs]
--> 317         return torch.cat(inputs, dim)
    318 
    319     @staticmethod

TypeError: cat received an invalid combination of arguments - got (tuple, int), but expected one of:
 * (sequence[torch.cuda.FloatTensor] seq)
 * (sequence[torch.cuda.FloatTensor] seq, int dim)
      didn't match because some of the arguments have invalid types: (tuple, int)

问题是为什么这些类型在 CUDA 中不匹配但它在 CPU 上工作以及如何解决这个问题？

PyTorch 是否有一个全局标志，可以只将所有类型更改为 CUDA 类型，而不是搞乱 CPU/GPU 类型？

【问题讨论】：

也许您在模型encoder_test 和decoder_test 以及Variable decoder_context 中忘记了.cuda()
也在discuss.pytorch.org/t/…
谢谢@MauelLagunas！事实上，encoder_hidden 和 decoder_context 没有 .cuda()
这能回答你的问题吗？ How to run PyTorch computations in cuda as default

标签： python torch pytorch tensor

【解决方案1】：

PyTorch 是否有一个全局标志，可以只将所有类型更改为 CUDA 类型，而不是搞乱 CPU/GPU 类型？

没有。

（来源：https://discuss.pytorch.org/t/porting-seq2seq-tutorial-from-spro-practical-pytorh-from-cpu-to-gpu/8604）

具体到示例：

decoder_test 对象的输入变量需要为.cuda() 类型。更具体地说：

encoder_hidden = encoder_test.init_hidden()
---> encoder_hidden = encoder_test.init_hidden().cuda()


decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
---> decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda()

所以测试网络的代码应该是：

encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L

encoder_hidden = encoder_test.init_hidden().cuda()
if USE_CUDA:
    word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda())
else:
    word_inputs = Variable(torch.LongTensor([1, 2, 3]))
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda()

decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
print(decoder_output)
print(decoder_hidden)
print(decoder_attn)

[出]：

Variable containing:
-2.1412 -2.4589 -2.4042 -2.1591 -2.5080 -2.0839 -2.5058 -2.3831 -2.4468 -2.0804
[torch.cuda.FloatTensor of size 1x10 (GPU 0)]

Variable containing:
(0 ,.,.) = 

Columns 0 to 8 
  -0.0264 -0.0689  0.1049  0.0760  0.1017 -0.4585 -0.1273  0.0449 -0.3271

Columns 9 to 9 
  -0.0104

(1 ,.,.) = 

Columns 0 to 8 
  -0.0308 -0.0690 -0.0258 -0.2759  0.1403 -0.0468 -0.0205  0.0126 -0.1729

Columns 9 to 9 
   0.0599
[torch.cuda.FloatTensor of size 2x1x10 (GPU 0)]

Variable containing:
(0 ,.,.) = 
  0.3328  0.3328  0.3344
[torch.cuda.FloatTensor of size 1x1x3 (GPU 0)]

【讨论】：

【解决方案2】：

你也可以试试：

net = YouNetworkClass()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)

之后，您还必须将word_inputs、encoder_hidden 和decoder_context 发送到GPU：

word_inputs, encoder_hidden, decoder_context = word_inputs.to(device), encoder_hidden.to(device), decoder_context.to(device)

看这里：https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#training-on-gpu

【讨论】：

【解决方案3】：

PyTorch 是否有一个全局标志，可以只将所有类型更改为 CUDA 类型，而不是搞乱 CPU/GPU 类型？

是的。你可以 set the default tensor type 到 cuda 使用：

torch.set_default_tensor_type('torch.cuda.FloatTensor')

【讨论】：