【Title】: Why is my LSTM in tensorflow learning so slowly and badly?
【Posted】: 2017-12-10 22:33:45
【Question】:

This program reads a text file, RNNtext.txt, builds a one-hot vector representation of all the data, trains an LSTM on it, and periodically prints a batch of sampled characters. But even the cost vs iterations graph shows that it learns very inefficiently. Honestly, my raw LSTM code (in numpy) does better: not only is it faster, it produces mostly meaningful words, while this one produces nothing but gibberish. Where is my mistake? I'm out of ideas and can't seem to locate the logic error.

import numpy as np
import random
import tensorflow as tf
import os
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

# Reading RNNtext.txt file
direc = os.path.dirname(os.path.realpath(__file__))
data = open(direc + "/RNNtext.txt", "r").read()

# Array of unique characters
chars = list(set(data))

num_hidden = 80
iterations = 1000
display_iteration = 100 # Sample when iteration % display_iteration == 0
sample_size = 250
batch_size = 120 # batch size or the number of time steps to unroll RNN
alpha = 0.01 # Learning rate

#Vocabulary and text file sizes
vocab_size = len(chars)
data_size = len(data)

# Bijection from a unique character to an index
char_to_ix = {}
# Bijection from an index to a unique character
ix_to_char = {}

for j in range(vocab_size):
    char_to_ix[chars[j]] = j
    ix_to_char[j] = chars[j]


# Transforming all characters to indices    
data_ix = [char_to_ix[ch] for ch in data]


train_data = [] # This will contain one-hot vectors
for k in range(data_size):
    # Representing each index/character by a one-hot vector
    hot1 = np.zeros((vocab_size, 1))
    hot1[data_ix[k]] = 1
    train_data.append(hot1)



X = tf.placeholder(tf.float32, [None, vocab_size, 1]) #Number of examples, number of input, dimension of each input
target = tf.placeholder(tf.float32, [None, vocab_size])


cell = tf.contrib.rnn.LSTMCell(num_hidden,state_is_tuple=True)
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
output = tf.transpose(output, [1, 0, 2])


weight = tf.Variable(tf.random_normal([num_hidden, vocab_size]))
bias = tf.Variable(tf.constant(0.0, shape=[vocab_size]))

prediction = tf.matmul(output[-1], weight) + bias
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=target))

optimizer = tf.train.ProximalGradientDescentOptimizer(alpha)
minimize = optimizer.minimize(cost)


init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)


ARR = [i for i in range(vocab_size)] # for extracting index by probabilities in np.random.choice()

ITER = []
COST = []

p = 0 # p will be iterated by batch_size steps
for i in range(iterations):
    if p + batch_size >= data_size:
        p = 0

    # sweeping through data one-hot vectors 
    inp, out = train_data[p:p+batch_size], train_data[p+1:p+batch_size+1]
    out = np.reshape(out, [-1, vocab_size])

    c = sess.run(cost, {X: inp, target: out}) # calculating cost for plotting later
    COST.append(c)
    ITER.append(i)

    sess.run(minimize, {X: inp, target: out})

    # displaying sample_size number of characters with random seed
    # doesn't affect training
    if i % display_iteration == 0:
        seed = np.random.randint(0, vocab_size)
        CHARS = []
        for j in range(sample_size):
            x = np.zeros((vocab_size, 1))
            x[seed] = 1
            x = [x]
            pred = sess.run(prediction, {X: x})[0]
            pred = np.exp(pred) / np.sum(np.exp(pred))
            pred = pred.ravel()

            seed = np.random.choice(ARR, 1, p = pred)[0]
            ch = ix_to_char[seed]
            CHARS.append(ch)
        TXT = ''.join(CHARS)

        print("-------------------------------------------------")
        print(TXT)
        print("Iteration: ", str(i))

    p += batch_size
sess.close()
plt.plot(ITER, COST)
plt.show()

EDIT: adding the numpy code for comparison

import numpy as np
import matplotlib.pyplot as plt
import os
plt.style.use('fivethirtyeight')
direc = os.path.dirname(os.path.realpath(__file__))

readFile = open(direc + "\RNNtext.txt", 'r')

data = readFile.read()
readFile.close()


chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(chars)
print("Vocabulary size: " + str(vocab_size))
char_to_ix = {}
ix_to_char = {}

for j in range(len(chars)):
    char_to_ix[chars[j]] = j
    ix_to_char[j] = chars[j]

hidden_size = 80
batch_size = 120
alpha = 0.1
sample_size = 250
iterations = 1000
display_iteration = 100



Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias


def sample(hid, seed, weights, sample_size):
    X = np.zeros((vocab_size, 1))
    X[seed] = 1
    CHARS = []
    ARR = [i for i in range(vocab_size)]

    for t in range(sample_size):
        hid = np.tanh(np.dot(Wxh, X)  + np.dot(Whh, hid) + bh)
        y = np.dot(Why, hid) + by
        prob = np.exp(y) / np.sum(np.exp(y))
        prob = prob.ravel()
        ix = np.random.choice(ARR, 1, p=prob)[0]
        CHARS.append(ix_to_char[ix])
        X = np.zeros((vocab_size, 1))
        X[ix] = 1
        TXT = ''.join(CHARS)
    return TXT

LOSS = []
ITER = []
p = 0

mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad

smooth_loss = -np.log(1.0/vocab_size)*batch_size # loss at iteration 0
hprev = np.zeros((hidden_size,1))


for i in range(iterations): ## just time passing by

    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)   
    dWhy = np.zeros_like(Why)   
    dbh = np.zeros_like(bh) 
    dby = np.zeros_like(by)     


    if p+batch_size >= len(data) or i == 0:
        hprev = np.zeros((hidden_size,1))
        p = 0

    inputs = [char_to_ix[ch] for ch in data[p:p+batch_size]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+batch_size+1]]

    HID = {}
    X = {}
    Y = {}
    P = {}
    HID[-1] = np.copy(hprev)

    loss = 0

    ##======FORWARD======##
    for t in range(len(inputs)):
        X[t] = np.zeros((vocab_size,1))
        X[t][inputs[t]] = 1

        HID[t] = np.tanh(np.dot(Wxh, X[t])  + np.dot(Whh, HID[t-1]) + bh) # inp -> X
        Y[t] = np.dot(Why, HID[t]) + by # tanh
        P[t] = np.exp(Y[t]) / np.sum(np.exp(Y[t]))
        loss += -np.log(P[t][targets[t]][0])
    dhnext = np.zeros_like(HID[0])
    ##======BACKPROP======##
    for t in reversed(range(len(inputs))):

        dy = np.copy(P[t])
        dy[targets[t]] -= 1
        dh = (np.dot(Why.T, dy) + dhnext)*(1-HID[t]*HID[t]) 
        dx = np.dot(Why.T, dy)*(1 - HID[t]**2)

        dWhy += np.dot(dy, HID[t].T) 
        dWhh += np.dot(dh, HID[t-1].T) 
        dWxh += np.dot(dh, X[t].T) 
        dby += dy 
        dbh += dh 

        dhnext = np.dot(Whh.T, dh)

    ##=====================##
    hprev = HID[-1]
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -1, 1, out=dparam) # clip to mitigate exploding gradients


    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):


        mem += dparam * dparam

        param += -alpha * dparam / np.sqrt(mem + 1e-8) # Adagrad
    if i % display_iteration == 0:
        print(str(i))
        weights = [Wxh,Whh,Why,bh,by]
        seed = inputs[np.random.randint(0,len(inputs))]
        TXT = sample(HID[-1], seed, weights, sample_size)
        print("-----------------------------------------------")
        print(TXT)
        print("-----------------------------------------------")
        with open(direc + "\RNNout.txt", 'w') as writeFile:
            writeFile.write(TXT)
    ITER.append(i)
    LOSS.append(loss)

    p += batch_size
best_text = sample(HID[-1], inputs[0], weights, sample_size)


plt.plot(ITER, LOSS, linewidth = 1)
plt.show()

writeFile.close()

【Discussion】:

  • Not enough information provided. How long did you train? What were the training/validation costs at the start and at the end? You need to provide details about the system you're using to get a sensible answer. Almost all RNN models take a great deal of time and compute to train to the point where they stop spitting out garbage.
  • As the code shows, I set the number of iterations to 1000, which takes about a minute or so to train. But that's not the point: I can read exactly the same file with the same LSTM hyperparameters in my raw code (written in numpy), and it starts producing words after 10 seconds. I left this tensorflow implementation running overnight, and by morning it was still producing gibberish.
  • Well, when you say "producing words" we don't know what that means. What data are you training on, and what's the target? Can we see your numpy implementation? I'd bet your "raw code" implementation isn't doing the same thing.
  • I've edited my OP. I basically included Andrej Karpathy's well-known simple-RNN code. I did also reimplement it for an LSTM, but that version is a bit long (and, for a single layer, about as good as the simple RNN in numpy). As for the data, I don't see why you put so much emphasis on it: I just copy-pasted the first paragraph of plain Latin characters I came across, about 1200 characters long.

Tags: python machine-learning tensorflow neural-network lstm


【Solution 1】:

Well, doh... it looks like you are not reusing the state! How is an LSTM (a state machine) supposed to work properly if you don't maintain its state?

To me, this looks like a red flag:

output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)

The second output of tf.nn.dynamic_rnn is the latest state after the given sequence has been processed. It looks like you are explicitly ignoring it and never feeding it back in on each subsequent training iteration in sess.run(...) (and, accordingly, your dynamic_rnn call has no initial_state argument).

I would strongly recommend changing this part of your code before looking any further.

Also, I don't know what your data looks like, but your feeding and batching strategy needs to make sense of this whole state-passing exercise. Otherwise, once again, it will just produce gibberish.
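Concretely, in TF1 terms this means adding a state placeholder, passing it as initial_state to tf.nn.dynamic_rnn, capturing the returned final state from sess.run, and feeding it back on the next call. The same state-threading idea can be sketched framework-free in numpy, reusing the vanilla-RNN update from the question's own code (the toy sizes and weights here are hypothetical, purely for illustration):

```python
import numpy as np

# Hypothetical toy sizes, for illustration only
vocab_size, hidden_size, batch_size = 5, 8, 4
rng = np.random.default_rng(0)
Wxh = rng.normal(scale=0.01, size=(hidden_size, vocab_size))
Whh = rng.normal(scale=0.01, size=(hidden_size, hidden_size))
bh = np.zeros((hidden_size, 1))

def run_batch(xs, h):
    """Run one batch of one-hot column vectors; return the final hidden state."""
    for x in xs:
        h = np.tanh(Wxh @ x + Whh @ h + bh)
    return h

# A stream of one-hot vectors, chopped into consecutive batches
data_ix = rng.integers(0, vocab_size, size=3 * batch_size)
one_hots = [np.eye(vocab_size)[[i]].T for i in data_ix]

# Key point: the state is threaded from batch to batch,
# not reset to zeros on every call
h = np.zeros((hidden_size, 1))
for p in range(0, len(one_hots), batch_size):
    h = run_batch(one_hots[p:p + batch_size], h)  # feed the last state back in
```

In the TF version, the analogue is `state_ph = cell.zero_state(...)`-style placeholders fed via `feed_dict`, with `sess.run` returning the new state each iteration so it can be supplied to the next one.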

【Discussion】:

  • Thanks for the reply. Correct me if I'm wrong, but I thought tf.nn.dynamic_rnn() updates the state during training by default. Printing the state from sess.run() confirms that it changes across iterations.
  • Yes, it updates the state at every time step within the sequence you feed in. But presumably you want to call dynamic_rnn many times, over multiple epochs etc.? In that case the state produced by each call to dynamic_rnn needs to be fed back in on the following steps. That's for training. For prediction it's the same thing: you feed in a 10-step X, it generates an output, and you feed that back in to generate another output. All of this requires carefully maintained state. Your code doesn't do that, I don't think.
  • BTW, that's what I meant when I said you have to make sure your feeding and batching make sense around this strategy (i.e., if you chop your training data into batches and feed them over multiple epochs). You need to ensure that each call to dynamic_rnn passes its state on to the next call in a way that's meaningful (i.e., dynamic_rnn repeatedly sees sequences of X that sensibly follow one another, in order).
  • OK. That makes sense: it's resetting the state to zeros across different batches. I'll work on making it "remember". Thanks again.
【Solution 2】:

Based on the information provided, I would suggest these two initial steps to try to improve the model:

  1. Increase the number of iterations. Recurrent neural networks work differently from other deep architectures and may need an order of magnitude more iterations to converge.

  2. Play with the seed: in my experience, getting meaningful sequences can depend on the quality of the seed used.
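Beyond the seed character itself, one knob that often separates gibberish from readable samples (an addition here; neither code listing in the question uses it) is a softmax temperature applied to the logits before np.random.choice. Lower temperatures make sampling greedier, higher ones make it more random. A minimal sketch:

```python
import numpy as np

def sample_char(logits, temperature=1.0, rng=None):
    """Softmax-sample an index from logits; lower temperature -> greedier."""
    if rng is None:
        rng = np.random.default_rng()
    z = logits / temperature
    z = z - z.max()                      # subtract max for numerical stability
    p = np.exp(z) / np.sum(np.exp(z))    # temperature-scaled softmax
    return rng.choice(len(p), p=p)
```

With a very low temperature this effectively becomes argmax; with temperature 1.0 it reproduces the plain `np.exp(pred) / np.sum(np.exp(pred))` sampling already in the question's code.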

【Discussion】:

  • Thanks for the suggestions. First, I left the tensorflow implementation training the LSTM for 100,000 iterations, and it ended up producing worse results than the raw numpy simple RNN does after 500 iterations. Second, the seed is randomly initialized in both the tensorflow and the numpy code; given the large sample size, and that I'm training only on Latin letters, I don't think it makes much of a difference.