【Title】: TensorFlow custom op gradient
【Posted】: 2016-03-24 15:39:59
【Description】:

We want to create a custom layer in TensorFlow, so we decided to start with a simple toy example: a copy layer. After some trial and error, we got to the point where the gradient seems to pass the correct values. In the second iteration, however, the features become NaN. It is probably a simple mistake, but at the moment I can't see it.

Overall, I have two questions:

  1. Can anyone spot the problem here, and how can it be fixed?
  2. What is a good way to debug a TensorFlow session?

copy_op.cc

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include <stdio.h>

namespace tensorflow {



typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template<typename Device, typename T>
class MyCopyOp: public OpKernel {
public:
    explicit MyCopyOp(OpKernelConstruction* context) :
            OpKernel(context) {
    }

    void Compute(OpKernelContext* context) override {
        const Tensor& input = context->input(0);
        auto in_flat = input.flat<T>();

        printf("Debug MyCopyOp Features: %s \n",input.DebugString().c_str());

        Tensor* output = nullptr;
        OP_REQUIRES_OK(context,
                context->allocate_output(0, input.shape(), &output));

        auto out_flat = output->flat<T>();
        out_flat.setZero();

        for (int d = 0; d < input.dims(); ++d) {
            for (int i = 0; i < input.dim_size(d); ++i) {
                out_flat(d * input.dim_size(d) + i) = in_flat(
                        d * input.dim_size(d) + i);
            }
        }

        printf("Debug MyCopyOp Output: %s \n",output->DebugString().c_str());
    }

};


template<typename Device, typename T>
class MyCopyGradOp: public OpKernel {
public:
    explicit MyCopyGradOp(OpKernelConstruction* context) :
            OpKernel(context) {

    }

    void Compute(OpKernelContext* context) override {
        printf("called MyCopyGradOp.Compute() \n");
        const Tensor& gradients = context->input(0);
        const Tensor& features = context->input(1);
        printf("Debug MyCopyOpGrad Gradients: %s \n",gradients.DebugString().c_str());
        printf("Debug MyCopyOpGrad Features: %s \n",features.DebugString().c_str());

        TensorShape output_shape = features.shape();

        Tensor* output = nullptr;
        OP_REQUIRES_OK(context,
                context->allocate_output(0, output_shape, &output));
        output->flat<T>().setZero();

        const T* btm_ptr = gradients.flat<T>().data();
        T* top_ptr = output->flat<T>().data();

        for (int i = 0; i < gradients.NumElements(); ++i) {
            top_ptr[i] = btm_ptr[i];
        }

        printf("Debug MyCopyOpGrad Output: %s \n",output->DebugString().c_str());
        printf("---------------------------------- \n");
    }

};


REGISTER_OP("MyCopy")
.Input("features: T")
.Output("output: T")
.Attr("T: realnumbertype")
.Doc(R"doc(
Copies all input values to the output
)doc");

REGISTER_OP("MyCopyGrad")
.Input("gradients: T")
.Input("features: T")
.Output("backprops: T")
.Attr("T: realnumbertype")
.Doc(R"doc(
TODO!!
)doc");


#define REGISTER_MYCOPY_KERNELS(type)                                           \
  REGISTER_KERNEL_BUILDER(                                                      \
      Name("MyCopy").Device(DEVICE_CPU).TypeConstraint<type>("T"),              \
      MyCopyOp<Eigen::ThreadPoolDevice, type>);                                 \
  REGISTER_KERNEL_BUILDER(                                                      \
      Name("MyCopyGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"),          \
      MyCopyGradOp<Eigen::ThreadPoolDevice, type>);

// GPU kernels disabled for now (plain comments, so no stray `\` continuations
// can leak into the macro above):
//   REGISTER_KERNEL_BUILDER(
//       Name("MyCopy").Device(DEVICE_GPU).TypeConstraint<type>("T"),
//       MyCopyOp<Eigen::GpuDevice, type>);
//   REGISTER_KERNEL_BUILDER(
//       Name("MyCopyGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),
//       MyCopyGradOp<Eigen::GpuDevice, type>);


REGISTER_MYCOPY_KERNELS(float); 
REGISTER_MYCOPY_KERNELS(int);
REGISTER_MYCOPY_KERNELS(double);


}
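As an aside (not the cause of the NaN reported below): the nested loop in `MyCopyOp::Compute` indexes the flat buffer as `d * dim_size(d) + i`, which for a multi-dimensional tensor visits only a small subset of the elements; everything else keeps the `setZero()` fill. A small Python sketch of the index coverage for a hypothetical `[50, 10]` tensor:

```python
# Index-coverage sketch for the nested loop in MyCopyOp::Compute,
# for a hypothetical tensor of shape [50, 10] (500 elements).
shape = [50, 10]
num_elements = shape[0] * shape[1]

visited = set()
for d in range(len(shape)):       # mirrors: for (int d = 0; d < input.dims(); ++d)
    for i in range(shape[d]):     # mirrors: for (int i = 0; i < input.dim_size(d); ++i)
        visited.add(d * shape[d] + i)

# Only flat positions 0..49 are ever written (d = 1 just revisits 10..19),
# so 450 of the 500 output elements keep their zero fill. This is masked in
# the debug output because DebugString() only prints the first few values.
print(len(visited), num_elements)   # 50 500
```

A single loop over `NumElements()` on the flat views, like the one in `MyCopyGradOp`, copies every element.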

We based this on the simple MNIST example:

layer_test.py

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

import tensorflow as tf
from tensorflow.python.framework import ops
copy_op_module = tf.load_op_library('copy_op.so')

@ops.RegisterGradient("MyCopy")
def _CopyOpGrad(op, grad):
  return copy_op_module.my_copy_grad(grad,op.inputs[0])

sess = tf.InteractiveSession()

x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])

W = tf.Variable(tf.zeros([784,10]))
b = tf.Variable(tf.zeros([10]))

sess.run(tf.initialize_all_variables())

y1 = tf.nn.softmax(tf.matmul(x,W) + b)
y = copy_op_module.my_copy(y1)            # Here: the MyCopy layer is inserted

cross_entropy = -tf.reduce_sum(y_*tf.log(y))

train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

for i in range(2):
  batch = mnist.train.next_batch(50)
  train_step.run(feed_dict={x: batch[0], y_: batch[1]})

correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels}))

Compilation

TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
TF_LIB=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')
g++ -std=c++11 -shared copy_op.cc -o copy_op.so -I $TF_INC -L $TF_LIB -fPIC -Wl,-rpath $TF_LIB

Output:

Debug MyCopyOp Features: Tensor<type: float shape: [50,10] values: 0.1 0.1 0.1...> 
Debug MyCopyOp Output: Tensor<type: float shape: [50,10] values: 0.1 0.1 0.1...> 
called MyCopyGradOp.Compute() 
Debug MyCopyOpGrad Gradients: Tensor<type: float shape: [50,10] values: -0 -0 -0...> 
Debug MyCopyOpGrad Features: Tensor<type: float shape: [50,10] values: 0.1 0.1 0.1...> 
Debug MyCopyOpGrad Output: Tensor<type: float shape: [50,10] values: -0 -0 -0...> 
---------------------------------- 
Debug MyCopyOp Features: Tensor<type: float shape: [50,10] values: nan nan nan...> 
Debug MyCopyOp Output: Tensor<type: float shape: [50,10] values: nan nan nan...> 
called MyCopyGradOp.Compute() 
Debug MyCopyOpGrad Gradients: Tensor<type: float shape: [50,10] values: nan nan nan...> 
Debug MyCopyOpGrad Features: Tensor<type: float shape: [50,10] values: nan nan nan...> 
Debug MyCopyOpGrad Output: Tensor<type: float shape: [50,10] values: nan nan nan...> 
---------------------------------- 
Debug MyCopyOp Features: Tensor<type: float shape: [10000,10] values: nan nan nan...> 
Debug MyCopyOp Output: Tensor<type: float shape: [10000,10] values: nan nan nan...> 
0.098

Many thanks in advance!

【Comments】:

  • From the output, your MyCopyOp and MyCopyGradOp appear to be working as intended. Can you confirm whether the weights become NaN without the copy op? (To do that, simply remove the copy layer, run a single training step, and then call y1.eval(feed_dict={x: batch[0], y_: batch[1]}) in the second iteration.)
  • For what it's worth, computing the cross entropy as -tf.reduce_sum(y_ * tf.log(y)) has known stability problems (use tf.nn.softmax_cross_entropy_with_logits(y, y_) instead), and initializing the W variable to zeros typically gives worse results than initializing it randomly. (See this answer for more discussion.)
  • Thanks for your help! 1. Without the copy layer, y1 evaluates to [[ 0.07910535 0.07910535 0.07910535 0.11042032 0.10930145 ..., whereas with the copy layer the result after one step is [[ nan nan nan nan nan ...
  • Do the gradients of W and b change when you add/remove the copy layer? You can get those tensors by calling W_grad, b_grad = tf.gradients(cross_entropy, [W, b]) and then evaluate them with sess.run([W_grad, b_grad], feed_dict={...}).
  • Simply using tf.nn.softmax_cross_entropy_with_logits(y, y_) did the trick!!

Tags: tensorflow


【Solution 1】:

From mrry in the comments: computing the cross entropy as -tf.reduce_sum(y_ * tf.log(y)) has known stability problems (use tf.nn.softmax_cross_entropy_with_logits(y, y_) instead), and initializing the W variable to zeros typically gives worse results than initializing it randomly. This answer has more details on the weight-initialization issue.
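To see why the hand-written loss produces NaN, here is a small NumPy sketch (an illustration under assumed saturated logits, not code from the question): once the softmax saturates, some probabilities underflow to exactly 0 in float32, log(0) is -inf, and 0 * -inf is NaN. softmax_cross_entropy_with_logits works from the logits instead and never takes log(0):

```python
import numpy as np

# A saturated softmax: one logit dominates by more than float32 can resolve.
logits = np.array([0.0, 0.0, 200.0], dtype=np.float32)
labels = np.array([1.0, 0.0, 0.0], dtype=np.float32)

# Naive formulation, as in the question: softmax first, then log.
# exp(-200) underflows to 0 in float32, so log(probs) contains -inf,
# and 0 * -inf in the sum produces NaN.
probs = np.exp(logits - logits.max())
probs /= probs.sum()
naive = -np.sum(labels * np.log(probs))

# Stable formulation (the kind of computation the logits-based op does
# internally): work with log-probabilities, never taking log(0).
log_probs = logits - logits.max() - np.log(np.sum(np.exp(logits - logits.max())))
stable = -np.sum(labels * log_probs)

print(naive)    # nan
print(stable)   # 200.0 (large but finite)
```

The same mechanism applies to the gradient, which for the naive loss involves y_ / y and blows up as y approaches 0. With zero-initialized weights and softmax outputs computed in float32, the training loop can reach this regime quickly, matching the NaNs at the second iteration.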

【Discussion】:
