在 python 中使用 Keras RNN 进行拼写校正答案

【问题标题】：Spell correction using Keras RNN in python在 python 中使用 Keras RNN 进行拼写校正
【发布时间】：2018-03-19 12:24:53
【问题描述】：

我正在使用 RNN 进行拼写纠正，下面是我正在使用的代码

from __future__ import print_function, division, unicode_literals

import os
import errno
from collections import Counter
from hashlib import sha256
import re
import json
import itertools
import logging
import requests
import numpy as np
import pandas as pd
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros # pylint:disable=no-name-in-module

from keras.models import Sequential, load_model
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout, recurrent
from keras.callbacks import Callback

# Set a logger for the module
LOGGER = logging.getLogger(__name__) # Every log will use the module name
LOGGER.addHandler(logging.StreamHandler())
LOGGER.setLevel(logging.DEBUG)

random_seed(123) # Reproducibility

class Configuration(object):
    """Dump stuff here"""
CONFIG = Configuration()
#pylint:disable=attribute-defined-outside-init
# Parameters for the model:
CONFIG.input_layers = 2
CONFIG.output_layers = 2
CONFIG.amount_of_dropout = 0.2
CONFIG.hidden_size = 500
CONFIG.initialization = "he_normal" # : Gaussian initialization scaled by fan-in (He et al., 2014)
CONFIG.number_of_chars = 100
CONFIG.max_input_len = 20
CONFIG.inverted = True

# parameters for the training:
CONFIG.batch_size = 100 # As the model changes in size, play with the batch size to best fit the process in memory
CONFIG.epochs = 500 # due to mini-epochs.
CONFIG.steps_per_epoch = 1000 # This is a mini-epoch. Using News 2013 an epoch would need to be ~60K.
CONFIG.validation_steps = 10
CONFIG.number_of_iterations = 10

dataset=pd.read_csv("input_spell.csv")
input_data=dataset['input'].tolist()
input_data1=str(input_data)
output_data=dataset['output'].tolist()
output_data1=str(output_data)


chars=list("abcdefghijklmnopqrstuvwxyz")

MIN_INPUT_LEN = 1
AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len
CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")

class CharacterTable(object):
    """
    Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars):
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

    @property
    def size(self):
        """The number of chars"""
        return len(self.chars)

    def encode(self, C, maxlen):
        """Encode as one-hot"""
        X = np.zeros((maxlen, len(self.chars)), dtype=np.bool) # pylint:disable=no-member
        for i, c in enumerate(C):
            X[i, self.char_indices[c]] = 1
        return X

    def decode(self, X, calc_argmax=True):
        """Decode from one-hot"""
        if calc_argmax:
            X = X.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in X if x)
    


def _vectorize(questions, answers, ctable):
    """Vectorize the data as numpy arrays"""
    len_of_questions = len(questions)
    X = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size),dtype=int)
    print("inputchars")
    for i in range(len(questions)):
        print(i)
        sentence = questions.pop()
        print(sentence)
        for j, c in enumerate(sentence):
            print(j)
            print(c)
            try:
                X[i, j, ctable.char_indices[c]] = 1
            except KeyError:
                pass # Padding
    y = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size), dtype=int)
    print("outputchars")
    for i in range(len(answers)):
        print(i)
        sentence = answers.pop()
        print(sentence)
        for j, c in enumerate(sentence):
            try:
                y[i, j, ctable.char_indices[c]] = 1
            except KeyError:
                pass # Padding
    return X, y



def vectorize(questions, answers, chars=None):
    """Vectorize the questions and expected answers"""
    print('Vectorization...')
    chars = chars or CHARS
    ctable = CharacterTable(chars)
    print("inputdata before _vec")
    print(questions)
    X, y = _vectorize(questions, answers, ctable)
    # Explicitly set apart 10% for validation data that we never train over
    #print("input after _vec")
    #print(X)
    #print("output after _vec")
    #print(y)

    print(X.shape)
    print(y.shape)

    return  X, y, CONFIG.max_input_len, ctable

def generate_model(output_len, chars=None):
    """Generate the model"""
    print('Build model...')
    chars = chars or CHARS
    model = Sequential()
    # "Encode" the input sequence using an RNN, producing an output of hidden_size
    # note: in a situation where your input sequences have a variable length,
    # use input_shape=(None, nb_feature).
    for layer_number in range(CONFIG.input_layers):
        model.add(recurrent.LSTM(CONFIG.hidden_size, input_shape=(None, len(chars)), kernel_initializer=CONFIG.initialization,
                                 return_sequences=layer_number + 1 < CONFIG.input_layers))
        model.add(Dropout(CONFIG.amount_of_dropout))
    # For the decoder's input, we repeat the encoded input for each time step
    model.add(RepeatVector(output_len))
    # The decoder RNN could be multiple layers stacked or a single layer
    for _ in range(CONFIG.output_layers):
        model.add(recurrent.LSTM(CONFIG.hidden_size, return_sequences=True, kernel_initializer=CONFIG.initialization))
        model.add(Dropout(CONFIG.amount_of_dropout))

    # For each of step of the output sequence, decide which character should be chosen
    model.add(TimeDistributed(Dense(len(chars), kernel_initializer=CONFIG.initialization)))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
    """Iterative Training"""
    # Train the model each generation and show predictions against the validation dataset
    for iteration in range(1, CONFIG.number_of_iterations):
        #print()
        #print('-' * 50)
        #print('Iteration', iteration)
        model.fit(X_train, y_train, batch_size=CONFIG.batch_size, epochs=CONFIG.epochs,
                  validation_data=(X_val, y_val))
        #print_random_predictions(model, ctable, X_val, y_val)


def print_random_predictions(model, ctable, X_val, y_val):
    """Select 10 samples from the validation set at random so we can visualize errors"""
    print()
    for _ in range(10):
        #ind = random_randint(0, len(X_val))
        #rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])] # pylint:disable=no-member
        rowX, rowy = X_val, y_val
        preds = model.predict_classes(rowX, verbose=0)
        print("preds")
        print(preds)
        q = ctable.decode(rowX[0])
        print("q-value")
        correct = ctable.decode(rowy[0])
        print("correct")
        print(correct)
        guess = ctable.decode(preds[0], calc_argmax=False)
        print("predicted")
        print(guess)
        return guess
        

X_train,y_train, y_maxlen, ctable = vectorize(input_data, output_data, chars)
print ("y_maxlen, chars", y_maxlen, "".join(chars))
model = generate_model(y_maxlen, chars)
iterate_training(model, X_train, y_train, X_train, y_train, ctable)
for inp in X_train:
    inputarray = ctable.decode(inp)
    print(inputarray)
prediction=model.predict_classes(X_train, verbose=0)
for p in prediction:
    guess = ctable.decode(p, calc_argmax=False)
    print(guess)

以下是文件input_spell.csv

的内容

input   output
sol     solid
kt      kit
whl     wheel
abr     abrasive
unv     universal
pp      pipe
plt     plate
accum   accumulator

我从deepspell获取代码

下面是我对训练集的预测结果

ccumultorrrrrrrrrr
plteeeeeeeeeeeeeeee
pipeeellllllllllllll
universllllllllllll
brsiveeellllllllll
wheellllllllllllllll
kitteeeeeeeeeeeellll
solidddddddddddddddd

输入和输出向量的大小

输入：(8, 20, 26) 输出：(8, 20, 26)

所以我得到长度为 20 的预测结果

我对RNN和LSTM有非常基本的了解

更新当我尝试可视化 model.summary() 我得到了

model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, None, 500)         1054000   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 500)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 500)               2002000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 20, 500)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 20, 500)           2002000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 20, 500)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 20, 500)           2002000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 20, 500)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 26)            13026     
_________________________________________________________________
activation_1 (Activation)    (None, 20, 26)            0         
=================================================================
Total params: 7,073,026
Trainable params: 7,073,026
Non-trainable params: 0
_________________________________________________________________

谁能告诉我哪里出错了？

【问题讨论】：

您是否尝试通过打印model.summary() 来可视化模型以尝试发现错误？
@MohamedElzare 我已编辑并包含有问题的model.summary()
@RanjanaGirish 已解决问题

标签： python keras lstm recurrent-neural-network spell-checking

【解决方案1】：

我建议仔细检查max_input_len、output_len 和y_maxlen 变量，您的模型训练成功了吗？什么是验证错误？如果它似乎训练成功，那么我怀疑泛化步骤的向量化数据的形状有问题。

【讨论】：

【解决方案2】：

如果单词大小小于在向量化时定义大小，则可以通过在单词后添加填充来解决此问题。谢谢兰贾纳为此。只需在 for 循环后的 _vectorize 中添加以下代码如下图：

for j, c in enumerate(sentence): 
    try:
        char_index_nm = ctable.char_indices[c]
        X[0, j, char_index_nm] = 1.0
    except KeyError:
        pass # Padding
#Below is the line that need to be added for space
X[0, j + 1 :, ctable.char_indices[" "]] = 1.0

【讨论】：