cifar10 示例 tensorflow 的分段错误（核心转储）错误答案

【问题标题】：Segmentation fault (core dumped) error for cifar10 example tensorflow（cifar10 示例 tensorflow 的分段错误（核心转储）错误）
【发布时间】：2016-09-28 21:06:49
【问题描述】：

我正在尝试修改 cifar10 示例代码，但不确定为什么在运行修改后的 cifar10_eval.py 时会出现分段错误（核心转储）。这段代码在 Mac 上实际上可以运行，我不确定为什么它在 Linux 上不行。

感谢您的帮助。

-----------下面的代码----------- --------

# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Evaluation for CIFAR-10
Accuracy:
cifar10_train.py achieves 83.0% accuracy after 100K steps (256 epochs
of data) as judged by cifar10_eval.py.
Speed:
On a single Tesla K40, cifar10_train.py processes a single batch of 128 images
in 0.25-0.35 sec (i.e. 350 - 600 images /sec). The model reaches ~86%
accuracy after 100K steps in 8 hours of training time.
Usage:
Please see the tutorial and website for how to download the CIFAR-10
data set, compile the program and train the model.
http://tensorflow.org/tutorials/deep_cnn/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime
import math
import time

import numpy as np
import tensorflow as tf
import os
import StringIO
import cv
import cv2
import urllib


from PIL import Image

import matplotlib

import glob

import cifar10

# Working directory captured at import time; eval_once() writes its
# "result/" directory tree underneath it.
cur_dir = os.getcwd()

# Shared flag namespace. Flags defined by other imported modules are read
# through this same object.
FLAGS = tf.app.flags.FLAGS

# --- Command-line flags for the evaluation script ---------------------------
tf.app.flags.DEFINE_string(
    'eval_dir', '/tmp/cifar10_eval',
    'Directory where to write event logs.')
tf.app.flags.DEFINE_string(
    'eval_data', 'test',
    "Either 'test' or 'train_eval'.")
tf.app.flags.DEFINE_string(
    'checkpoint_dir', '/tmp/cifar10_train',
    'Directory where to read model checkpoints.')
tf.app.flags.DEFINE_integer(
    'eval_interval_secs', 60 * 5,
    'How often to run the eval.')
tf.app.flags.DEFINE_integer(
    'num_examples', 128,
    'Number of examples to run.')
tf.app.flags.DEFINE_boolean(
    'run_once', False,
    'Whether to run eval only once.')


def eval_once(saver, summary_writer, top_k_op, summary_op, images, labels, logits):
  """Run one evaluation pass and dump every evaluated image to disk.

  Restores the latest checkpoint from FLAGS.checkpoint_dir, evaluates
  ceil(FLAGS.num_examples / FLAGS.batch_size) batches, prints and logs
  precision @ 1, and writes each evaluated image to
  <cur_dir>/result/<Correct|Incorect>/<predicted class>/<index>.jpeg.

  Args:
    saver: Saver used to restore the model variables from the checkpoint.
    summary_writer: Summary writer that receives the precision summary.
    top_k_op: Top K op; evaluates to one boolean per example in the batch.
    summary_op: Summary op.
    images: Input image batch tensor (indexed by example along axis 0).
    labels: Label tensor. Accepted for interface compatibility; it is not
      fetched here because top_k_op already compares logits against it.
    logits: Logits tensor, one row of class scores per example.

  Returns:
    None. Returns early (after printing a message) when no checkpoint exists.
  """
  # Build the softmax op once. Creating it inside the evaluation loop would
  # add a new node to the graph on every iteration.
  softmax_op = tf.nn.softmax(logits)

  with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
      print('No checkpoint file found')
      return

    # Restores from checkpoint
    saver.restore(sess, ckpt.model_checkpoint_path)
    # Assuming model_checkpoint_path looks something like:
    #   /my-favorite-path/cifar10_train/model.ckpt-0,
    # extract global_step from it.
    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]

    # Start the queue runners.
    coord = tf.train.Coordinator()
    threads = []
    try:
      for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
        threads.extend(qr.create_threads(sess, coord=coord, daemon=True,
                                         start=True))

      num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
      total_sample_count = num_iter * FLAGS.batch_size
      true_count = 0  # Counts the number of correct predictions.
      step = 0

      while step < num_iter and not coord.should_stop():
        # Fetch everything in ONE run call. Each separate sess.run() pulls a
        # fresh batch from the input queue, so fetching predictions and
        # images separately would pair images with the wrong predictions.
        correct, probabilities, batch_images = sess.run(
            [top_k_op, softmax_op, images])
        true_count += np.sum(correct)

        # Cast to float so the class directory names keep their previous
        # form (e.g. "3.0").
        predicted = np.argmax(probabilities, axis=1).astype(np.float64)
        _save_eval_images(batch_images, correct, predicted,
                          step * FLAGS.batch_size)
        step += 1

      # Compute precision @ 1 (guarding against a zero-example run).
      precision = (true_count / total_sample_count
                   if total_sample_count else 0.0)
      print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

      summary = tf.Summary()
      summary.ParseFromString(sess.run(summary_op))
      summary.value.add(tag='Precision @ 1', simple_value=precision)
      summary_writer.add_summary(summary, global_step)
    except Exception as e:  # pylint: disable=broad-except
      # Hand the error to the coordinator; coord.join() below re-raises it.
      coord.request_stop(e)

    coord.request_stop()
    coord.join(threads, stop_grace_period_secs=10)


def _save_eval_images(batch_images, correct, predicted, first_index):
  """Write one batch of images under result/<Correct|Incorect>/<class>/.

  Args:
    batch_images: Array of images for one batch, indexed by example on axis 0.
    correct: Per-example truth values; True when the prediction was correct.
    predicted: Per-example predicted class ids (used as directory names).
    first_index: File-name index of the first image in this batch, so that
      later batches do not overwrite the files of earlier ones.
  """
  result_root = os.path.join(cur_dir, "result")
  for offset in range(len(batch_images)):
    pixels = batch_images[offset, :]

    # Rescale to the 0..255 range. A constant image has zero range, so map
    # it to all zeros instead of dividing by zero.
    lowest = pixels.min()
    value_range = pixels.max() - lowest
    if value_range > 0:
      pixels = (pixels - lowest) / value_range * 255
    else:
      pixels = np.zeros_like(pixels)

    outcome = "Correct" if correct[offset] else "Incorect"
    class_dir = os.path.join(result_root, outcome, str(predicted[offset]))
    if not os.path.isdir(class_dir):
      # 0o755 is the octal spelling accepted by both Python 2.6+ and 3.
      os.makedirs(class_dir, 0o755)

    # NOTE(review): cv2.imwrite interprets 3-channel data as BGR; confirm the
    # channel order of `images` if colour fidelity of the dumps matters.
    cv2.imwrite(os.path.join(class_dir, str(first_index + offset) + ".jpeg"),
                pixels)


def evaluate():
  """Build the CIFAR-10 eval graph and run a single evaluation pass.

  The pass is run exactly once; the 'run_once' and 'eval_interval_secs'
  flags are not consulted by this function.
  """
  with tf.Graph().as_default() as g:
    # Get images and labels for CIFAR-10.
    eval_data = FLAGS.eval_data == 'test'
    images, labels = cifar10.inputs(eval_data=eval_data)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate predictions.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)
    try:
      eval_once(saver, summary_writer, top_k_op, summary_op, images, labels,
                logits)
    finally:
      # Flush pending events and release the event file even if the
      # evaluation pass raised.
      summary_writer.close()


def main(argv=None):  # pylint: disable=unused-argument
  """Fetch the dataset, reset the eval log directory, then evaluate."""
  cifar10.maybe_download_and_extract()

  # Start from an empty event-log directory on every run.
  eval_dir = FLAGS.eval_dir
  if tf.gfile.Exists(eval_dir):
    tf.gfile.DeleteRecursively(eval_dir)
  tf.gfile.MakeDirs(eval_dir)

  evaluate()


if __name__ == '__main__':
  # tf.app.run() parses the command-line flags and then calls main().
  tf.app.run()

【问题讨论】：

标签： python image gpu tensorflow deep-learning

【解决方案1】：

这看起来像是一个反复出现的问题（recurring issue）：TensorFlow Python 模块与 OpenCV 和/或 PIL 库中的代码发生冲突。根本原因通常是这些库各自包含的 libjpeg 或 libpng 版本互不兼容。

在最新的 TensorFlow 夜间版本（nightly）中，此问题应该已经修复（fixed）。作为替代的解决方法，您可以尝试移动下面这一行：

import tensorflow as tf

……移到 cv、cv2 和 PIL 的导入语句之后。

【讨论】：

似乎不再有错误消息了。感谢您的帮助！