AlexNet 在张量流中不收敛答案

【问题标题】：AlexNet doesn't converge in tensorflowAlexNet 在张量流中不收敛
【发布时间】：2017-10-24 02:52:02
【问题描述】：

我最近决定从 MATLAB 迁移到 Tensorflow。我首先在 Tensorflow 中创建了 ALexNet 模型。我想从头开始用我自己的数据训练 AlexNet。（我已经在 MATLAB 中使用 matconvnet 成功地做到了这一点）。但是，我在 tensorflow 中的模型永远不会收敛。损失和准确性保持不变。我将数据保存在 tfrecords 文件中并读取它，然后可以验证数据是否正确加载。但我不明白为什么模型 deosnt 似乎可以训练。我在 Ubuntu 16.04 上使用 TensorFlow 1.2.0 和 Python 2.7。

这是我的代码：其中 'val.tfrecords' 是包含我的培训日期的 tfrecords 文件

import numpy as np
import matplotlib.pyplot as plt
import sys
import cv2
from random import shuffle
import random as rand
import glob
import tensorflow as tf
import os
import scipy.misc
import math

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

# Initialize the weights with random numbers
def W_init (w_height,w_width,num_channels,num_filters,method = 'normal', W_name = 'w'):
    if method is 'normal': std =1
    if method is 'xavier': std = std = np.sqrt(2./(w_width*w_height*num_channels))
    return(tf.Variable(tf.truncated_normal([w_height,w_width,num_channels,num_filters], mean=0.0, stddev=std),name = W_name,trainable=True))



# Create wrappers for simplicity

# Convolution layer
def conv_layer(x,W,b,stride,pad):
    x = tf.pad(x, [[0, 0], [pad, pad], [pad, pad], [0, 0]], "CONSTANT")
    y = tf.nn.conv2d(x, W, strides=[1,stride,stride,1], padding='VALID')
    y = tf.nn.bias_add(y,b)
    return y

# Pooling layer
def pool_layer(x,k,stride,method):
    if method is 'max':
        y = tf.nn.max_pool(x, ksize = [1,k,k,1], strides = [1,stride,stride,1], padding='VALID')
    if method is 'avg':
        y = tf.nn.avg_pool(x, ksize = [1,k,k,1], strides = [1,stride,stride,1], padding='VALID')
    return y 


# Create a model AlexNet
def AlexNet(x,y_):

    # input
    x = tf.reshape(x,shape =[-1,224,224,3] )

    # conv1 + relu1
    conv1 = tf.nn.relu(conv_layer(x,W=W_init (11,11,3,96,method = 'xavier',W_name ='w1'), b=tf.Variable(tf.zeros([96]),name='b1',trainable=True),stride=4,pad= 3))
    # maxpool1
    max1 = pool_layer(conv1,k=2,stride=2,method='max')


    # conv2 + relu2
    conv2 = tf.nn.relu(conv_layer(max1,W=W_init (5,5,96,256,method = 'xavier',W_name ='w2'), b=tf.Variable(tf.zeros([256]),name='b2',trainable=True),stride=1,pad= 2))
    # maxpool2
    max2 = pool_layer(conv2,k=2,stride=2,method='max')

    # conv3 + relu3
    conv3 = tf.nn.relu(conv_layer(max2,W=W_init (3,3,256,384,method = 'xavier',W_name ='w3'), b=tf.Variable(tf.zeros([384]),name='b3',trainable=True),stride=1,pad= 1))

    # conv4 + relu4
    conv4 = tf.nn.relu(conv_layer(conv3,W=W_init (3,3,384,384,method = 'xavier',W_name ='w4'), b=tf.Variable(tf.zeros([384]),name='b4',trainable=True),stride=1,pad= 1))

    # conv5 + relu5
    conv5 = tf.nn.relu(conv_layer(conv4,W=W_init (3,3,384,256,method = 'xavier',W_name ='w5'), b=tf.Variable(tf.zeros([256]),name='b5',trainable=True),stride=1,pad= 1))
    # maxpool5
    max5 = pool_layer(conv5,k=2,stride=2,method='max')

    # flatten the convolution output to use in fc layer
    max5_size = np.product([s.value for s in max5.get_shape()[1:]])
    max5_flat = tf.reshape(max5, [-1, max5_size ])    


    #  fc6 + relu6 +drop6
    fc6 = tf.nn.relu(tf.matmul(max5_flat,tf.Variable(tf.truncated_normal([max5_size,4096],mean=0.0, stddev=2./math.sqrt(max5_size)),name='w6',trainable=True))+  tf.Variable(tf.zeros([4096]),name='b6',trainable=True) )
    drop6 = tf.nn.dropout(fc6, 0.5)

    #  fc7 + relu7 +drop7
    fc7 = tf.nn.relu(tf.matmul(drop6, tf.Variable(tf.truncated_normal([4096,4096], mean=0.0, stddev=2./math.sqrt(4096)),name='w7',trainable=True))+ tf.Variable(tf.zeros([4096]),name='b7',trainable=True) )
    drop7 = tf.nn.dropout(fc7, 0.5)

    #  fc8
    fc8 = tf.matmul(drop7, tf.Variable(tf.truncated_normal([4096,23], mean=0.0, stddev=2./math.sqrt(23)),name='w8',trainable=True))+ tf.Variable(tf.zeros([23]),name='b8',trainable=True) 

    y = tf.nn.softmax(fc8) 
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))

    # Evaluate model
    correct_pred = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))    
    optimizer = tf.train.AdamOptimizer(0.01).minimize(cost)


    return cost,accuracy



train_filename = '/home/Documents/MyData/val.tfrecords'

# Model parameters
learning_rate = 0.01
Nimages = 1087
mean_image = np.load('mean_image_256.npy') # This is the 
n_input = 224*224*3 # img shape: 224*224*3
n_classes = 23
batch_size = 200 
Num_epochs = 1000
display_step = 3*batch_size

## Read the tfrecord file we've just created #############################################
# 1- Create a list of filenames: In this case there's only a single file 
train_data_path = train_filename


# TF graph Inputs and Placeholders
x_ = tf.placeholder(tf.float32, [None,224,224,3])
y_ = tf.placeholder(tf.float32, [None,n_classes])

# Define out loss and optimizer

with tf.Session() as sess:
    feature = {'train/image' : tf.FixedLenFeature([], tf.string),
               'train/label' : tf.FixedLenFeature([], tf.int64),
               'train/height': tf.FixedLenFeature([], tf.int64),
               'train/width' : tf.FixedLenFeature([], tf.int64)}

    # 2- Create a queue to hold filenames: To do so, we use tf.train.string_input_producer function which hold filenames in a FIFO queue.
    # it gets the list of filnames. It also has some optional arguments including  num_epochs which indicates the number of epoch you want to to load the data,
    # and shuffle which indicates whether to suffle the filenames in the list or not. It is set to True by default.

    train_filename_queue = tf.train.string_input_producer([train_data_path], num_epochs=None)
    val_filename_queue   = tf.train.string_input_producer([val_data_path], num_epochs=None)

    # 3- Define a reader and read the next record
    # For files of TFRecords we need to define a TFRecordReader with reader = tf.TFRecordReader().
    # Now, the reader returns the next record using: reader.read(filename_queue)
    reader = tf.TFRecordReader()

    _, train_serialized_example = reader.read(train_filename_queue)
    _,  val_serialized_example =  reader.read(val_filename_queue)

    # 4- Decode the record read by the reader
    # A decoder is needed to decode the record read by the reader.
    # In case of using TFRecords files the decoder should be tf.parse_single_example. it takes a serialized Example and a dictionary
    # which maps feature keys to FixedLenFeature or VarLenFeature values
    # and returns a dictionary which maps feature keys to Tensor values: features = tf.parse_single_example(serialized_example, features=feature)
    train_features = tf.parse_single_example(train_serialized_example, features=feature)
    val_features = tf.parse_single_example(val_serialized_example, features=feature)

    # 5- Convert the image data from string back to the numbers
    # tf.decode_raw(bytes, out_type) takes a Tensor of type string and convert it to typeout_type.
    # However, for labels which have not been converted to string, we just need to cast them using tf.cast(x, dtype)
    train_image = tf.decode_raw(train_features['train/image'], tf.float32)

    # 6- Cast label data into int32 and Reshape image data into the original shape
    train_label = tf.cast(train_features['train/label'], tf.int32)
    train_label = tf.one_hot(train_label, n_classes)
    train_height = tf.cast(train_features['train/height'], tf.int32)
    train_width = tf.cast(train_features['train/width'], tf.int32)
    train_image = tf.reshape(train_image, tf.stack([train_height, train_width, 3]))



    # 7- Any preprocessing here ...
    train_image = tf.image.central_crop(train_image, 1)
    train_image = tf.image.resize_images(train_image, [256,256])
    train_image = tf.random_crop(train_image, [224, 224, 3])
    train_image = tf.image.random_flip_left_right(train_image)





    # 8- Creates batches by randomly shuffling tensors
    # Batching: Another queue is needed to create batches from the examples. You can create the batch queue using:
    # tf.train.shuffle_batch([image, label], batch_size=10, capacity=30, num_threads=1, min_after_dequeue=10)
    # where capacity is the maximum size of queue, min_after_dequeue is the minimum size of queue after dequeue,
    # and num_threads is the number of threads enqueuing examples.
    # Using more than one thread, it comes up with a faster reading.
    # The first argument in a list of tensors which you want to create batches from.
    train_images, train_labels = tf.train.shuffle_batch([train_image, train_label], batch_size=batch_size, capacity=3*batch_size, num_threads=1, min_after_dequeue=batch_size,allow_smaller_final_batch=True)




    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    cost,accuracy = AlexNet(x_,y_)

    # 9- Initialize all global and local variables

    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    sess.run(init_op) 
    writer = tf.summary.FileWriter(logdir = '/tmp/tf/foo', graph=tf.get_default_graph())
    writer.flush()

    # 10- Create a coordinator and run all QueueRunner objects
    # Filing the example queue: Some functions of tf.train such as tf.train.shuffle_batch add tf.train.
    # QueueRunner objects to your graph. Each of these objects hold a list of enqueue op for a queue 
    # to run in a thread. Therefore, to fill a queue you need to call tf.train.start_queue_runners
    # which starts threades for all the queue runners in the graph. 
    # However, to manage these threads you need a tf.train.Coordinator to terminate the threads at the proper time.



    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()


    train_acc = np.zeros(Num_epochs)
    val_acc = np.zeros(Num_epochs)

    for epoch in range(Num_epochs):
        im_counter = 0

        for iter in range(Nimages/batch_size):
            im_counter+=batch_size
            # Get a training image batch  and  subtract mean
            t_img, t_lbl = sess.run([train_images, train_labels])
            t_img = (t_img - scipy.misc.imresize(mean_image, (224,224)))/255
            #t_lbl = to_onehot(t_lbl,n_classes)



            # Run session

            # Calculate batch loss and accuracy
            loss, acc = sess.run([cost, accuracy], feed_dict={x_: t_img, y_: t_lbl})

            # Display training results
            if (im_counter%display_step)==0:
                print "epoch " + str(epoch) +  " Processed images: " + str(im_counter) + "/" +str(Nimages)+", Minibatch Loss= " + \
                "{:.6f}".format(loss) + ", Accuracy= " + \
                "{:.5f}".format(acc) 

        # After an epoch is trained, save model and run validation
        saver.save(sess, 'AlexNet_saved_model.ckpt')
        train_acc[epoch] = acc

        im_counter = 0


    print "Optimization Finished!"
    plt.plot(train_acc)
    plt.title('Training accuracy')
    plt.show()            
    # Stop the threads
    coord.request_stop()

    # Wait for threads to stop
    coord.join(threads) 
    sess.close()

【问题讨论】：

您为什么要从 MATLAB 转换，而不是使用任何方便的模型动物园中提供的 AlexNet？这应该可以解决您遇到的任何问题。

标签： python tensorflow

【解决方案1】：

我在您的代码中看到至少一个错误：

y = tf.nn.softmax(fc8) 
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))

您不应该在调用tf.nn.softmax_cross_entropy_with_logits 之前应用softmax。 documentation 明确指出：

警告：此操作需要未缩放的 logits，因为它执行 softmax 在logits 内部以提高效率。不要用 softmax 的输出，因为它会产生不正确的结果。

【讨论】：

感谢您的回答。我实际上是错误地在 softmax_cross_entropy_with_logits 之前应用了 softmax。我修好了。但仍然得到相同的行为。