用途:

这篇博客解释了如何搭建一个简单的匹配网络。并且使用了keras的lambda层。在建立网络之前需要对数据进行预处理。处理过后,文本转变为id字符序列。将一对question,answer分别编码可以得到两个向量,在匹配层中比较两个向量,计算相似度。

网络图示:

 

keras 问答匹配孪生网络文本匹配 RNN 带有数据

数据准备:

数据基于网上的淘宝客服对话数据,我也会放在我的下载页面中。原数据是对话,我筛选了其中label为1的对话,然后将对话拆解成QA对:q是用户,a是客服。对于每个q,有一个a是匹配的,label为1;再另选一个不匹配的a,构成新的样本,label为0。

 

超参数:

比较简单,具体看代码就可以了。

# Maximum number of (q, a) dialogue pairs to extract from the corpus.
max_pair = 30000
# Vocabulary size: keep only the top-k most frequent words.
MAX_FEATURES = 450
# Fixed (padded/truncated) token length for both question and answer.
MAX_SENTENCE_LENGTH = 30
embedding_size = 100  # word-embedding dimensionality
batch_size = 600
# learning rate
lr = 0.01  # NOTE(review): not used by the visible code (Adam defaults apply) — confirm
HIDDEN_LAYER_SIZE = n_hidden_units = 256  # neurons in hidden layer

细节:

导入一些库

# -*- coding: utf-8 -*-
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
from alime_data import convert_dialogue_to_pair
from parameter import MAX_SENTENCE_LENGTH,MAX_FEATURES,embedding_size,max_pair,batch_size,HIDDEN_LAYER_SIZE
DATA_DIR = "../data"  # NOTE(review): not referenced in the visible code — confirm before removing
NUM_EPOCHS = 2
# Read training data and generate vocabulary.
maxlen = 0    # longest tokenized sentence seen while scanning the corpus
num_recs = 0  # total number of QA-pair records

 

 数据准备,先统计词频,然后取出top N个常用词,然后将句子转换成 单词id的序列。把句子中的有效id靠右边放,将句子左边补齐padding。然后分成训练集和测试集

 

word_freqs = collections.Counter()
training_data = convert_dialogue_to_pair(max_pair)
# One row per QA pair — no need to materialize a throwaway list to count.
num_recs = len(training_data)

# Count token frequencies over both the question and the answer side, and
# track the longest tokenized sentence seen (informational only: padding
# later uses the fixed MAX_SENTENCE_LENGTH, not maxlen).
for _, row in training_data.iterrows():
    for sentence in (row['sentence_q'], row['sentence_a']):
        words = nltk.word_tokenize(sentence.lower())
        maxlen = max(maxlen, len(words))
        word_freqs.update(words)
## Build the vocabulary lookup tables and allocate the output arrays.

# Ids 0 and 1 are reserved (0 = PAD, 1 = UNK), so real words start at id 2
# and the total vocabulary is the top-MAX_FEATURES words plus those 2 slots.
vocab_size = min(MAX_FEATURES, len(word_freqs)) + 2
word2index = {x[0]: i+2 for i, x in enumerate(word_freqs.most_common(MAX_FEATURES))}
word2index["PAD"] = 0
word2index["UNK"] = 1
# Reverse mapping, used later to decode id sequences back into words.
index2word = {v:k for k, v in word2index.items()}
# convert sentences to sequences
X_q = np.empty((num_recs, ), dtype=list)  # ragged question id-sequences (padded later)
X_a = np.empty((num_recs, ), dtype=list)  # ragged answer id-sequences
y = np.zeros((num_recs, ))                # binary match labels
i = 0  # running row index for the conversion loop below
def chinese_split(x, sep=' '):
    """Split a pre-tokenized sentence string into tokens.

    The corpus stores Chinese sentences with tokens already separated by
    spaces, so a plain delimiter split is sufficient here (no real
    segmentation is performed).

    :param x: sentence string
    :param sep: token delimiter (default single space, the corpus format)
    :return: list of token strings
    """
    return x.split(sep)

def _sentence_to_ids(sentence):
    """Map one pre-tokenized sentence to a list of word ids (unknown -> UNK)."""
    unk = word2index["UNK"]
    return [word2index.get(word, unk) for word in chinese_split(sentence)]

# Convert every (q, a, label) row into parallel id sequences.
for line in training_data.iterrows():
    label = line[1]['label']
    X_q[i] = _sentence_to_ids(line[1]['sentence_q'])
    X_a[i] = _sentence_to_ids(line[1]['sentence_a'])
    y[i] = int(label)
    i += 1

# Left-pad (or truncate) every sequence to the fixed model input length,
# so the meaningful ids sit at the right edge and zeros fill the left.
X_a = sequence.pad_sequences(X_a, maxlen=MAX_SENTENCE_LENGTH)
X_q = sequence.pad_sequences(X_q, maxlen=MAX_SENTENCE_LENGTH)
# Keep each (q, a) pair bundled so the train/test split keeps them aligned.
X = [[X_q[j], X_a[j]] for j in range(len(X_a))]

# Hold out 20% of the pairs for validation; fixed seed for reproducibility.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2,
                                                random_state=42)

# The model takes the question and answer batches as two separate inputs,
# so unbundle each [q, a] pair into parallel lists.
Xtrain_Q = [pair[0] for pair in Xtrain]
Xtrain_A = [pair[1] for pair in Xtrain]
Xtest_Q = [pair[0] for pair in Xtest]
Xtest_A = [pair[1] for pair in Xtest]

最后建立网络。先定义两个函数,一个是句子编码器,另一个是lambda层,计算两个向量的绝对差。将QA分别用encoder处理得到两个向量,把两个向量放入lambda层。最后有了2*hidden size的一层,将这一层接一个dense层,接activation,得到分类概率。

 

from keras.layers.wrappers import Bidirectional
from keras.layers import Input,Lambda
from keras.models import Model

def encoder(inputs_seqs, rnn_hidden_size, dropout_rate):
    """Sentence encoder: word ids -> embedding -> BiLSTM -> sentence vector.

    :param inputs_seqs: Input tensor of word ids, length MAX_SENTENCE_LENGTH
    :param rnn_hidden_size: LSTM units per direction
    :param dropout_rate: dropout and recurrent dropout inside the LSTM
    :return: encoded sentence vector
    """
    embedded = Embedding(vocab_size, embedding_size,
                         input_length=MAX_SENTENCE_LENGTH)(inputs_seqs)
    # SpatialDropout1D drops whole embedding channels, not single activations.
    regularized = SpatialDropout1D(0.2)(embedded)
    recurrent = LSTM(rnn_hidden_size, dropout=dropout_rate,
                     recurrent_dropout=dropout_rate, name='RNN')
    return Bidirectional(recurrent)(regularized)

def absolute_difference(vecs):
    """Merge function for the Lambda matching layer: element-wise |a - b|.

    :param vecs: two-element sequence holding the two encoded vectors
    :return: absolute difference of the pair
    """
    first, second = vecs
    delta = first - second
    return abs(delta)

# Two integer-id sequence inputs: the question and the candidate answer.
inputs_Q = Input(shape=(MAX_SENTENCE_LENGTH,), name="input")
inputs_A = Input(shape=(MAX_SENTENCE_LENGTH,), name="input_a")
# Encode each side into a fixed-size vector.
# NOTE(review): each encoder() call builds fresh layers, so the Q and A
# branches do NOT share weights — confirm whether a weight-sharing siamese
# setup was intended.
encoded_Q = encoder(inputs_Q,HIDDEN_LAYER_SIZE,0.1)
encoded_A = encoder(inputs_A,HIDDEN_LAYER_SIZE,0.1)

# Matching layer: element-wise absolute difference of the two encodings.
similarity  =  Lambda(absolute_difference)([encoded_Q, encoded_A])
# A single logistic unit turns the difference vector into a match probability.
polar = Dense(1)(similarity)
prop = Activation("sigmoid")(polar)
model = Model(inputs=[inputs_Q,inputs_A], outputs=prop)
model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])
training_history = model.fit([Xtrain_Q, Xtrain_A], ytrain, batch_size=batch_size,
                             epochs=NUM_EPOCHS,
                             validation_data=([Xtest_Q,Xtest_A], ytest))
# plot loss and accuracy
def plot(training_history):
    """Render the train/validation accuracy and loss curves in two stacked panels.

    :param training_history: Keras History object returned by ``model.fit``
    """
    panels = [
        (211, "Accuracy", "acc", "val_acc"),
        (212, "Loss", "loss", "val_loss"),
    ]
    for position, title, train_key, val_key in panels:
        plt.subplot(position)
        plt.title(title)
        plt.plot(training_history.history[train_key], color="g", label="Train")
        plt.plot(training_history.history[val_key], color="b", label="Validation")
        plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

# Evaluate on the held-out split using the same batch size as training.
score, acc = model.evaluate([Xtest_Q,Xtest_A], ytest, batch_size = batch_size)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))

# Spot-check 25 random test pairs: print predicted label, true label and the
# two decoded sentences (id 0 is PAD and is skipped when decoding).
for i in range(25):
    idx = np.random.randint(len(Xtest_Q))
    xtest_Q = Xtest_Q[idx].reshape(1,MAX_SENTENCE_LENGTH)
    xtest_A = Xtest_A[idx].reshape(1,MAX_SENTENCE_LENGTH)
    ylabel = ytest[idx]
    ypred = model.predict([xtest_Q,xtest_A])[0][0]
    sent_Q = " ".join([index2word[x] for x in xtest_Q[0].tolist() if x != 0])
    sent_A = " ".join([index2word[x] for x in xtest_A[0].tolist() if x != 0])
    print("%.0f\t%d\t%s\t%s" % (ypred, ylabel, sent_Q,sent_A))
 

 

最后是处理数据的函数,写在另一个文件里。

import nltk
from parameter import MAX_FEATURES,MAX_SENTENCE_LENGTH
import pandas as pd
from collections import Counter
def get_pair(number, dialogue):
    """Extract up to ``number`` (question, answer) utterance pairs.

    Each conversation line alternates user/agent turns separated by tabs;
    even indices are treated as questions, the following turn as the answer.

    :param number: maximum number of pairs to collect
    :param dialogue: iterable of raw conversation lines (label prefix included)
    :return: list of [question, answer] pairs (may be shorter than ``number``)
    """
    pairs = []
    for conversation in dialogue:
        # Drop the leading label + separator ("1\t") before splitting turns.
        utterances = conversation[2:].strip('\n').split('\t')
        for i, utterance in enumerate(utterances):
            if i % 2 != 0:
                continue
            # Fix: a conversation with an odd number of turns used to raise
            # IndexError here — skip a trailing question with no answer.
            if i + 1 >= len(utterances):
                break
            pairs.append([utterances[i], utterances[i + 1]])
            if len(pairs) >= number:
                return pairs
    return pairs


def convert_dialogue_to_pair(k):
    """Build a labelled QA DataFrame from the raw dialogue dump.

    Positive samples pair each question with its true answer (label 1);
    negative samples pair it with the answer 8 pairs ahead, wrapping around
    (label 0), so the output has twice as many rows as extracted pairs.

    :param k: maximum number of true QA pairs to extract
    :return: DataFrame with columns ['sentence_q', 'sentence_a', 'label']
    """
    # Fix: use a context manager so the file handle is always closed.
    with open('dialogue_alibaba2.txt', encoding='utf-8', mode='r') as f:
        dialogue = f.readlines()
    # Keep only conversations whose line is tagged with label '1'.
    dialogue = [p for p in dialogue if p.startswith('1')]
    print(len(dialogue))
    pairs = get_pair(k, dialogue)
    # Positive samples: the genuine (q, a) pairs.
    data = [[q, a, 1] for q, a in pairs]
    # Negative samples: mismatched answers. Guard against an empty corpus,
    # which previously caused a ZeroDivisionError in the modulo.
    if pairs:
        for i, p in enumerate(pairs):
            data.append([p[0], pairs[(i + 8) % len(pairs)][1], 0])
    df = pd.DataFrame(data, columns=['sentence_q', 'sentence_a', 'label'])
    print(len(data))
    return df

 

相关文章:

  • 2022-12-23
  • 2022-12-23
  • 2022-12-23
  • 2022-01-17
  • 2022-12-23
  • 2022-01-07
  • 2022-01-15
  • 2021-10-29
猜你喜欢
  • 2021-12-07
  • 2022-12-23
  • 2021-07-30
  • 2021-11-06
  • 2021-11-20
  • 2022-12-23
  • 2021-11-10
相关资源
相似解决方案