【问题标题】:Siamese network output连体网络输出
【发布时间】:2017-04-06 06:22:01
【问题描述】:

我正在尝试在 caffe 中实现一个连体网络,其中它由两个不共享权重的 imagenet 组成。所以我基本上想做的是给每个网络一个图像,最后尝试找出它们之间的距离是否相似,下面是我的prototxt。所以我的主要问题是我应该如何设置我的“num_output”?我的训练只有 2 个类,0 表示它们不相似,1 表示它们是否相似。

# Two-branch (Siamese-style) network; the two branches below do NOT share
# weights (no shared "param" names are declared anywhere in this file).
name: "Siamese_ImageNet"
# First input stream: images listed in train1.txt / test1.txt, resized to
# 256x256 and batched 32 at a time. Produces blobs "data" and "label".
layers {
  name: "data"
  type: IMAGE_DATA
  top: "data"
  top: "label"
  image_data_param {
    source: "train1.txt"
    batch_size: 32
    new_height: 256
    new_width: 256
  }
  include: { phase: TRAIN }
}
layers {
  name: "data"
  type: IMAGE_DATA
  top: "data"
  top: "label"
  image_data_param {
    source: "test1.txt"
    batch_size: 32
    new_height: 256
    new_width: 256
  }
  include: { phase: TEST }
}

# Second input stream: the paired images from train2.txt / test2.txt.
# Produces "data_p" and "label_p". NOTE(review): only "label" (from the
# first stream) is consumed by the loss layer at the end of this file;
# "label_p" is produced but never used.
layers {
  name: "data_p"
  type: IMAGE_DATA
  top: "data_p"
  top: "label_p"
  image_data_param {
    source: "train2.txt"
    batch_size: 32
    new_height: 256
    new_width: 256
  }
  include: { phase: TRAIN }
}
layers {
  name: "data_p"
  type: IMAGE_DATA
  top: "data_p"
  top: "label_p"
  image_data_param {
    source: "test2.txt"
    batch_size: 32
    new_height: 256
    new_width: 256
  }
  include: { phase: TEST }
}


# ---------------------------------------------------------------------------
# Branch 1: AlexNet-style feature extractor over "data" (old Caffe V1 syntax:
# "layers" with enum types). The repeated blobs_lr / weight_decay pairs are
# the per-blob multipliers: first value for the weights, second for the bias.
# ---------------------------------------------------------------------------
layers {
  name: "conv1"
  type: CONVOLUTION
  bottom: "data"
  top: "conv1"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layers {
  name: "relu1"
  type: RELU
  bottom: "conv1"
  top: "conv1"
}
layers {
  name: "pool1"
  type: POOLING
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
# Local response normalization, as in the original AlexNet.
layers {
  name: "norm1"
  type: LRN
  bottom: "pool1"
  top: "norm1"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layers {
  name: "conv2"
  type: CONVOLUTION
  bottom: "norm1"
  top: "conv2"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu2"
  type: RELU
  bottom: "conv2"
  top: "conv2"
}
layers {
  name: "pool2"
  type: POOLING
  bottom: "conv2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "norm2"
  type: LRN
  bottom: "pool2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layers {
  name: "conv3"
  type: CONVOLUTION
  bottom: "norm2"
  top: "conv3"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layers {
  name: "relu3"
  type: RELU
  bottom: "conv3"
  top: "conv3"
}
layers {
  name: "conv4"
  type: CONVOLUTION
  bottom: "conv3"
  top: "conv4"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu4"
  type: RELU
  bottom: "conv4"
  top: "conv4"
}
layers {
  name: "conv5"
  type: CONVOLUTION
  bottom: "conv4"
  top: "conv5"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu5"
  type: RELU
  bottom: "conv5"
  top: "conv5"
}
layers {
  name: "pool5"
  type: POOLING
  bottom: "conv5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
# Fully connected head of this branch.
layers {
  name: "fc6"
  type: INNER_PRODUCT
  bottom: "pool5"
  top: "fc6"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu6"
  type: RELU
  bottom: "fc6"
  top: "fc6"
}
layers {
  name: "drop6"
  type: DROPOUT
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
# fc7 is this branch's final feature vector, fed to the contrastive loss at
# the end of the file, so num_output here is the EMBEDDING dimension (the
# quantity the question asks about) -- it is not a class count, and 2 is a
# very small embedding for a similarity metric.
layers {
  name: "fc7"
  type: INNER_PRODUCT
  bottom: "fc6"
  top: "fc7"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 2
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
# NOTE(review): ReLU + Dropout are applied in-place to the embedding that the
# contrastive loss consumes -- unusual; consider feeding fc7 to the loss
# directly.
layers {
  name: "relu7"
  type: RELU
  bottom: "fc7"
  top: "fc7"
}
layers {
  name: "drop7"
  type: DROPOUT
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}

# ---------------------------------------------------------------------------
# Branch 2: byte-for-byte mirror of branch 1 operating on "data_p" (every
# layer/blob carries a "_p" suffix). Because no shared "param" names are
# declared, its weights are learned independently of branch 1.
# ---------------------------------------------------------------------------
layers {
  name: "conv1_p"
  type: CONVOLUTION
  bottom: "data_p"
  top: "conv1_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layers {
  name: "relu1_p"
  type: RELU
  bottom: "conv1_p"
  top: "conv1_p"
}
layers {
  name: "pool1_p"
  type: POOLING
  bottom: "conv1_p"
  top: "pool1_p"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "norm1_p"
  type: LRN
  bottom: "pool1_p"
  top: "norm1_p"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layers {
  name: "conv2_p"
  type: CONVOLUTION
  bottom: "norm1_p"
  top: "conv2_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu2_p"
  type: RELU
  bottom: "conv2_p"
  top: "conv2_p"
}
layers {
  name: "pool2_p"
  type: POOLING
  bottom: "conv2_p"
  top: "pool2_p"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "norm2_p"
  type: LRN
  bottom: "pool2_p"
  top: "norm2_p"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layers {
  name: "conv3_p"
  type: CONVOLUTION
  bottom: "norm2_p"
  top: "conv3_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layers {
  name: "relu3_p"
  type: RELU
  bottom: "conv3_p"
  top: "conv3_p"
}
layers {
  name: "conv4_p"
  type: CONVOLUTION
  bottom: "conv3_p"
  top: "conv4_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu4_p"
  type: RELU
  bottom: "conv4_p"
  top: "conv4_p"
}
layers {
  name: "conv5_p"
  type: CONVOLUTION
  bottom: "conv4_p"
  top: "conv5_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu5_p"
  type: RELU
  bottom: "conv5_p"
  top: "conv5_p"
}
layers {
  name: "pool5_p"
  type: POOLING
  bottom: "conv5_p"
  top: "pool5_p"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layers {
  name: "fc6_p"
  type: INNER_PRODUCT
  bottom: "pool5_p"
  top: "fc6_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
layers {
  name: "relu6_p"
  type: RELU
  bottom: "fc6_p"
  top: "fc6_p"
}
layers {
  name: "drop6_p"
  type: DROPOUT
  bottom: "fc6_p"
  top: "fc6_p"
  dropout_param {
    dropout_ratio: 0.5
  }
}
# fc7_p: second embedding (dimension 2, matching fc7) for the contrastive loss.
layers {
  name: "fc7_p"
  type: INNER_PRODUCT
  bottom: "fc6_p"
  top: "fc7_p"
  blobs_lr: 1
  blobs_lr: 2
  weight_decay: 1
  weight_decay: 0
  inner_product_param {
    num_output: 2
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
# NOTE(review): as in branch 1, ReLU + Dropout modify the embedding in place
# before it reaches the loss.
layers {
  name: "relu7_p"
  type: RELU
  bottom: "fc7_p"
  top: "fc7_p"
}
layers {
  name: "drop7_p"
  type: DROPOUT
  bottom: "fc7_p"
  top: "fc7_p"
  dropout_param {
    dropout_ratio: 0.5
  }
}

# Contrastive loss over the two embeddings (fc7, fc7_p), supervised by the
# first stream's "label" only. Per the discussion below: label == 1 pulls the
# pair together, label == 0 pushes it apart beyond the margin.
layers {
    name: "loss"
    type: CONTRASTIVE_LOSS
    contrastive_loss_param {
        margin: 1.0
    }
    bottom: "fc7"
    bottom: "fc7_p"
    bottom: "label"
    top: "loss"
}

我的训练文件结构: 0不相似,1相似

 train1.txt:
 /aer/img1_1.jpg 0
 /aer/img1_2.jpg 1
 /aer/img1_3.jpg 1

 train2.txt:
 /tpd/img2_1.jpg 0
 /tpd/img2_2.jpg 1
 /tpd/img2_3.jpg 1

【问题讨论】:

  • 如果你只有 2 个类,那么 1 个输出就足够了。
  • 好的,所以我想确保每次迭代都是对来自每个数据层的图像进行比较?还是他们单独作为单独的班级接受培训?你知道我可以在哪里阅读更多关于这种类型的信息吗,我只能找到共享权重类型?
  • 如果您使用对比损失,您可能希望 num_output 更高。
  • 你指的是什么num_output?您的模型是否产生任何错误?
  • 在 fc7 和 fc7_p 的内积层中。请看下面——我想其他部分我已经完全理解了……我是否应该先接一个对比损失层，然后再接 2 个 softmax 层？

标签: machine-learning computer-vision neural-network deep-learning caffe


【解决方案1】:

我相信 num_output 定义了提取的特征向量的维度，然后这些提取的特征可以用来计算 L2 距离。如果 L2 距离大于 1，则它们属于不同的类；如果它接近 0，则图像相似。其余部分，Dale 的回答是完美的。

【讨论】:

    【解决方案2】:

    只是为了针对 Caffe 极其敏感的语法纠正 Dale 上面的 answer——对于像我一样卡住的新手，这里有一些更正（layers 改为 layer、补上一些引号，另外删除了注释并改用有效的大小写）：

    # Classification head in the CURRENT Caffe prototxt syntax: "layer"
    # (singular) with quoted string types, and param { lr_mult } instead of
    # the old blobs_lr fields. Concatenates the two embeddings and trains a
    # binary softmax classifier (similar / dissimilar) on the result.
    layer {
      name: "concat"
      type: "Concat"
      bottom: "fc7"
      bottom: "fc7_p"  
      top: "fc_concat"
    }
    layer {
      name: "fc_cls"
      type: "InnerProduct"
      bottom: "fc_concat"
      top: "fc_cls"
      # lr_mult: 1 applies to the weights, 2 to the bias (usual convention).
      param {
        lr_mult: 1
      }
      param {
        lr_mult: 2
      }
      inner_product_param {
        num_output: 2
        weight_filler {
          type: "xavier"
        }
        bias_filler {
          type: "constant"
        }
      }
    }
    # Accuracy is reported only during the TEST phase.
    layer {
      name: "accuracy"
      type: "Accuracy"
      bottom: "fc_cls"
      bottom: "label"
      top: "accuracy"
      include {
        phase: TEST
      }
    }
    layer {
      name: "loss"
      type: "SoftmaxWithLoss"
      bottom: "fc_cls"
      bottom: "label"
      top: "loss"
    }
    

    【讨论】:

    • @Dale 好像您使用的是旧的 caffe prototxt 语法。
    • @Shai 是的,我想与问题保持一致。
    • 我在想,好电话。你有一个好的地方来制定提高连体性能的策略吗?我可以用一些 40x40 的方格获得高达 86% 的准确度,但我希望能进入 90 年代中期。让它更深似乎并没有增加太多。
    【解决方案3】:

    我应该如何设置我的“num_output”?

    在了解您应该设置多大的 num_output 之前，让我们先解释一下它的含义。事实上，你可以将 Siamese 网络的两侧，即 data -> fc7 和 data_p -> fc7_p，视为 2 个特征提取器。每一个都从相应数据层中的图像提取特征，例如 fc7 和 fc7_p。所以 num_output 定义了提取的特征向量的维度。

    在训练期间,ContrastiveLoss 层总是在向量表示的图像相似时(label == 1)尝试最小化提取的 2 个特征向量的距离,并在不同时最大化距离(label == 0)。即特征向量的距离越小,图像越相似。

    那么特征向量的最佳维度是什么,以最好地包含指示相似性的信息?或者你应该如何设置num_output?可能没有确切的值,这取决于特征提取器的编码质量(您可以将特征视为图像的代码)以及识别图像相似性的难度。所以基本上如果网络(特征提取器)很深并且不太难识别相似性,您可以选择相对较小的num_output 例如200,因为该特征可能被更大的网络很好地编码并且更具辨别力。如果不是,您可以尝试更大的值,例如500、1000 或尝试更复杂的网络。

    如果您想尝试使用 MultinomialLogisticLoss 而不是 ContrastiveLoss 层，您应该首先使用 CONCAT 之类的层将 2 个特征向量 fc7 和 fc7_p 融合为 1 个，然后将其输入 SOFTMAX_LOSS 层，像这样：

    ...#original layers
    # NOTE(review): unified to the old "layers" + enum-type syntax used by the
    # question's prototxt (the answerer's stated intent). The original snippet
    # mixed old-style "layers"/CONCAT stanzas with new-style "layer"/
    # param { lr_mult } stanzas, which no single Caffe version can parse.
    # See Solution 2 for the equivalent written entirely in the new syntax.
    layers {
      name: "concat"
      type: CONCAT
      bottom: "fc7"
      bottom: "fc7_p"
      top: "fc_concat" # concatenate fc7 and fc7_p along channel axis
    }
    layers {
      name: "fc_cls"
      type: INNER_PRODUCT
      bottom: "fc_concat"
      top: "fc_cls"
      blobs_lr: 1 # weight learning-rate multiplier (old-syntax lr_mult)
      blobs_lr: 2 # bias learning-rate multiplier
      inner_product_param {
        num_output: 2 # a binary classification problem in this case
        weight_filler {
          type: "xavier"
        }
        bias_filler {
          type: "constant"
        }
      }
    }
    layers {
      name: "accuracy"
      type: ACCURACY
      bottom: "fc_cls"
      bottom: "label"
      top: "accuracy"
      include: { phase: TEST }
    }
    layers {
      name: "loss"
      type: SOFTMAX_LOSS
      bottom: "fc_cls"
      bottom: "label"
      top: "loss"
    }
    

    更新

    为了比较相似性并将其用于部署，Contrastive Loss 或 Softmax Loss，哪种才是最佳的实现方法？

    Softmax Loss 易于部署。但它只能给你二元的预测结果，即相似或不相似。它给出的 2 个类别（相似、不同）上的概率分布通常过于极端（两极分化），例如 [0.9*, 0.0*]、[0.0*, 0.9*]……很多情况下并不能很好地反映输入的真实相似程度。

    使用 Contrastive Loss 时，您可以获得图像的判别性特征向量，并且可以用这些向量来计算相似性概率，就像 CVPR 2005 论文 Learning a Similarity Metric Discriminatively, with Application to Face Verification 在第 4.1 节中所做的那样（关键是使用属于同一主体的图像所生成的特征向量来计算多元正态密度）。您也可以使用阈值来控制模型的假阳性率和假阴性率（the false positive rate and the false negative rate），从而得到 ROC curve 以更好地评估模型。

    对了,要挖掘更多预测相似度的CNN架构,可以参考CVPR 2015论文Learning to Compare Image Patches via Convolutional Neural Networks

    【讨论】:

    • 但为了比较相似性并将其用于部署，Contrastive Loss 或 Softmax Loss，哪种才是最佳的实现方法？
    • 谢谢您,这非常有用!我认为我在正确的轨道上,我的部署是否应该与训练原型相同?
    • @MasterWizard 是的。计算 fc7 和 fc7_p 的欧氏距离，并将其与字段 margin 的值（默认 1）进行比较。（可以在训练前在 prototxt 中的 ContrastiveLoss 层设置它的值。）
    • @MasterWizard 你的模型在训练中收敛了吗？您是否检查过 fc7 和 fc7_p 中的权重不为零？您最好通过在数据层中添加 transform_param { scale: 0.00390625 } 把图像数据缩放到 [0, 1]。
    • 您可能会发现this thread 对调试很有用。
    猜你喜欢
    • 2021-12-22
    • 1970-01-01
    • 2010-12-21
    • 2018-07-19
    • 2011-01-11
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多