【发布时间】:2018-03-19 12:24:53
【问题描述】:
我正在使用 RNN 进行拼写纠正,下面是我正在使用的代码
from __future__ import print_function, division, unicode_literals
import os
import errno
from collections import Counter
from hashlib import sha256
import re
import json
import itertools
import logging
import requests
import numpy as np
import pandas as pd
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros # pylint:disable=no-name-in-module
from keras.models import Sequential, load_model
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout, recurrent
from keras.callbacks import Callback
# Set a logger for the module
LOGGER = logging.getLogger(__name__) # Every log will use the module name
LOGGER.addHandler(logging.StreamHandler())
LOGGER.setLevel(logging.DEBUG)
random_seed(123) # Reproducibility
class Configuration(object):
"""Dump stuff here"""
CONFIG = Configuration()
#pylint:disable=attribute-defined-outside-init
# Parameters for the model:
CONFIG.input_layers = 2
CONFIG.output_layers = 2
CONFIG.amount_of_dropout = 0.2
CONFIG.hidden_size = 500
CONFIG.initialization = "he_normal" # : Gaussian initialization scaled by fan-in (He et al., 2014)
CONFIG.number_of_chars = 100
CONFIG.max_input_len = 20
CONFIG.inverted = True
# parameters for the training:
CONFIG.batch_size = 100 # As the model changes in size, play with the batch size to best fit the process in memory
CONFIG.epochs = 500 # due to mini-epochs.
CONFIG.steps_per_epoch = 1000 # This is a mini-epoch. Using News 2013 an epoch would need to be ~60K.
CONFIG.validation_steps = 10
CONFIG.number_of_iterations = 10
dataset=pd.read_csv("input_spell.csv")
input_data=dataset['input'].tolist()
input_data1=str(input_data)
output_data=dataset['output'].tolist()
output_data1=str(output_data)
chars=list("abcdefghijklmnopqrstuvwxyz")
MIN_INPUT_LEN = 1
AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len
CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
class CharacterTable(object):
"""
Given a set of characters:
+ Encode them to a one hot integer representation
+ Decode the one hot integer representation to their character output
+ Decode a vector of probabilities to their character output
"""
def __init__(self, chars):
self.chars = sorted(set(chars))
self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
@property
def size(self):
"""The number of chars"""
return len(self.chars)
def encode(self, C, maxlen):
"""Encode as one-hot"""
X = np.zeros((maxlen, len(self.chars)), dtype=np.bool) # pylint:disable=no-member
for i, c in enumerate(C):
X[i, self.char_indices[c]] = 1
return X
def decode(self, X, calc_argmax=True):
"""Decode from one-hot"""
if calc_argmax:
X = X.argmax(axis=-1)
return ''.join(self.indices_char[x] for x in X if x)
def _vectorize(questions, answers, ctable):
"""Vectorize the data as numpy arrays"""
len_of_questions = len(questions)
X = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size),dtype=int)
print("inputchars")
for i in range(len(questions)):
print(i)
sentence = questions.pop()
print(sentence)
for j, c in enumerate(sentence):
print(j)
print(c)
try:
X[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass # Padding
y = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size), dtype=int)
print("outputchars")
for i in range(len(answers)):
print(i)
sentence = answers.pop()
print(sentence)
for j, c in enumerate(sentence):
try:
y[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass # Padding
return X, y
def vectorize(questions, answers, chars=None):
"""Vectorize the questions and expected answers"""
print('Vectorization...')
chars = chars or CHARS
ctable = CharacterTable(chars)
print("inputdata before _vec")
print(questions)
X, y = _vectorize(questions, answers, ctable)
# Explicitly set apart 10% for validation data that we never train over
#print("input after _vec")
#print(X)
#print("output after _vec")
#print(y)
print(X.shape)
print(y.shape)
return X, y, CONFIG.max_input_len, ctable
def generate_model(output_len, chars=None):
"""Generate the model"""
print('Build model...')
chars = chars or CHARS
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of hidden_size
# note: in a situation where your input sequences have a variable length,
# use input_shape=(None, nb_feature).
for layer_number in range(CONFIG.input_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, input_shape=(None, len(chars)), kernel_initializer=CONFIG.initialization,
return_sequences=layer_number + 1 < CONFIG.input_layers))
model.add(Dropout(CONFIG.amount_of_dropout))
# For the decoder's input, we repeat the encoded input for each time step
model.add(RepeatVector(output_len))
# The decoder RNN could be multiple layers stacked or a single layer
for _ in range(CONFIG.output_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, return_sequences=True, kernel_initializer=CONFIG.initialization))
model.add(Dropout(CONFIG.amount_of_dropout))
# For each of step of the output sequence, decide which character should be chosen
model.add(TimeDistributed(Dense(len(chars), kernel_initializer=CONFIG.initialization)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
"""Iterative Training"""
# Train the model each generation and show predictions against the validation dataset
for iteration in range(1, CONFIG.number_of_iterations):
#print()
#print('-' * 50)
#print('Iteration', iteration)
model.fit(X_train, y_train, batch_size=CONFIG.batch_size, epochs=CONFIG.epochs,
validation_data=(X_val, y_val))
#print_random_predictions(model, ctable, X_val, y_val)
def print_random_predictions(model, ctable, X_val, y_val):
"""Select 10 samples from the validation set at random so we can visualize errors"""
print()
for _ in range(10):
#ind = random_randint(0, len(X_val))
#rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])] # pylint:disable=no-member
rowX, rowy = X_val, y_val
preds = model.predict_classes(rowX, verbose=0)
print("preds")
print(preds)
q = ctable.decode(rowX[0])
print("q-value")
correct = ctable.decode(rowy[0])
print("correct")
print(correct)
guess = ctable.decode(preds[0], calc_argmax=False)
print("predicted")
print(guess)
return guess
X_train,y_train, y_maxlen, ctable = vectorize(input_data, output_data, chars)
print ("y_maxlen, chars", y_maxlen, "".join(chars))
model = generate_model(y_maxlen, chars)
iterate_training(model, X_train, y_train, X_train, y_train, ctable)
for inp in X_train:
inputarray = ctable.decode(inp)
print(inputarray)
prediction=model.predict_classes(X_train, verbose=0)
for p in prediction:
guess = ctable.decode(p, calc_argmax=False)
print(guess)
以下是文件input_spell.csv
的内容input output
sol solid
kt kit
whl wheel
abr abrasive
unv universal
pp pipe
plt plate
accum accumulator
我从deepspell获取代码
下面是我对训练集的预测结果
ccumultorrrrrrrrrr
plteeeeeeeeeeeeeeee
pipeeellllllllllllll
universllllllllllll
brsiveeellllllllll
wheellllllllllllllll
kitteeeeeeeeeeeellll
solidddddddddddddddd
输入和输出向量的大小
输入:(8, 20, 26) 输出:(8, 20, 26)
所以我得到长度为 20 的预测结果
我对RNN和LSTM有非常基本的了解
更新 当我尝试可视化 model.summary() 我得到了
model.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_1 (LSTM) (None, None, 500) 1054000
_________________________________________________________________
dropout_1 (Dropout) (None, None, 500) 0
_________________________________________________________________
lstm_2 (LSTM) (None, 500) 2002000
_________________________________________________________________
dropout_2 (Dropout) (None, 500) 0
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 20, 500) 0
_________________________________________________________________
lstm_3 (LSTM) (None, 20, 500) 2002000
_________________________________________________________________
dropout_3 (Dropout) (None, 20, 500) 0
_________________________________________________________________
lstm_4 (LSTM) (None, 20, 500) 2002000
_________________________________________________________________
dropout_4 (Dropout) (None, 20, 500) 0
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 26) 13026
_________________________________________________________________
activation_1 (Activation) (None, 20, 26) 0
=================================================================
Total params: 7,073,026
Trainable params: 7,073,026
Non-trainable params: 0
_________________________________________________________________
谁能告诉我哪里出错了?
【问题讨论】:
-
您是否尝试通过打印
model.summary()来可视化模型以尝试发现错误? -
@MohamedElzare 我已编辑并包含有问题的
model.summary() -
@RanjanaGirish 已解决问题
标签: python keras lstm recurrent-neural-network spell-checking