使用三元组损失预测图像答案

【问题标题】：Predicting image using triplet loss使用三元组损失预测图像
【发布时间】：2019-05-03 17:23:51
【问题描述】：

我是 NN 的新手。

我使用三元组损失方法构建了一个用于图像理解的神经网络。

而且我认为我缺少一些关于如何使用这种方法来预测图像标签的基本知识。

建立模型后，我应该如何预测示例图像？因为我的模型输入是三元组 - 三元组应该由什么构成？

至于理论，我认为我应该以某种方式获取测试图像的嵌入矩阵，然后使用 knn 和 k=1 来获得最近的嵌入。但我对如何在实践中做到这一点一无所知

我的代码正在运行并生成模型：

import numpy as np
import random
import os
import imageio
import matplotlib.pyplot as plt
import pandas as pd
from time import time

import tensorflow as tf
tf.set_random_seed(1)
from PIL import Image


from keras.models import Model
from keras.layers import Input, Lambda, concatenate
from keras.optimizers import Adam
from keras import backend as K
from keras.layers import Conv2D, PReLU, Flatten, Dense

# Margin added to (positive distance - negative distance) in triplet_loss
# before the result is clamped at zero.
ALPHA = 0.2  # Triplet Loss Parameter

def _add_batch_axis(image):
    """Return *image* reshaped to (-1, d0, d1, d2), i.e. with a leading axis."""
    return image.reshape(-1, image.shape[0], image.shape[1], image.shape[2])


def get_triplets(features):
    """Build one (anchor, positive, negative) triplet per labelled sample.

    Args:
        features: anything ``pd.DataFrame`` accepts whose first column holds
            the image arrays and whose last column holds the tag of each
            image (for example a list of ``(image, tag)`` pairs).

    Returns:
        ``np.array`` over a list with one entry per sample. Each entry is
        ``[anchor, positive, negative]`` where every image has been reshaped
        to carry a leading axis. The positive is drawn at random from the
        other samples with the anchor's tag, the negative at random from the
        samples with a different tag. Empty input yields an empty array.

    Raises:
        IndexError: if some sample has no other sample with the same tag, or
            no sample with a different tag, to draw from.
    """
    frame = pd.DataFrame(features)
    if frame.empty:
        return np.array([])

    # Work with row positions throughout. Mixing index labels with ``iloc``
    # only works for a default RangeIndex and silently picks the wrong rows
    # (or raises) for any other index.
    images = list(frame.iloc[:, 0])
    tags = frame.iloc[:, -1]
    positions = np.arange(len(frame))

    triplets = []
    for i, anchor in enumerate(images):
        is_same = np.asarray(tags == tags.iloc[i], dtype=bool)
        is_other = positions != i
        positive_pool = positions[is_same & is_other].tolist()
        negative_pool = positions[~is_same & is_other].tolist()

        if not positive_pool or not negative_pool:
            missing = "positive" if not positive_pool else "negative"
            raise IndexError(
                "cannot build a triplet for sample at position %d "
                "(tag %r): no %s candidate available"
                % (i, tags.iloc[i], missing)
            )

        # Draw the positive first, then the negative.
        positive = images[random.choice(positive_pool)]
        negative = images[random.choice(negative_pool)]

        triplets.append([
            _add_batch_axis(anchor),
            _add_batch_axis(positive),
            _add_batch_axis(negative),
        ])

    return np.array(triplets)

def triplet_loss(x):
    """Compute the hinge-style triplet loss from concatenated embeddings.

    Args:
        x: tensor that is split into three equal parts along axis 1, taken
            in the order anchor, positive, negative.

    Returns:
        The mean over axis 0 of ``max(d(anchor, positive) -
        d(anchor, negative) + ALPHA, 0)``, where ``d`` is the sum of squared
        differences along axis 1.
    """
    anchor, positive, negative = tf.split(x, 3, axis=1)

    def squared_distance(other):
        # Sum of squared element-wise differences to the anchor, along axis 1.
        return tf.reduce_sum(tf.square(tf.subtract(anchor, other)), 1)

    margin_gap = tf.add(
        tf.subtract(squared_distance(positive), squared_distance(negative)),
        ALPHA,
    )
    # Triplets that already satisfy the margin contribute zero.
    hinge = tf.maximum(margin_gap, 0.0)
    return tf.reduce_mean(hinge, 0)

# When fitting the model (i.e., model.fit()); use as an input [anchor_example,
# positive_example, negative_example] in that order and as an output zero.
# The reason to use the output as zero is that you are trying to minimize the 
# triplet loss as much as possible and the minimum value of the loss is zero.
def create_embedding_network(input_shape):
    """Build the network applied to each image of a triplet.

    The network is two 3x3 convolutions (32 then 64 filters), each followed
    by a PReLU, then a flatten and a 10-unit softmax dense layer.

    Args:
        input_shape: shape passed to the Keras ``Input`` layer.

    Returns:
        A Keras ``Model`` mapping the input tensor to the dense layer output.
    """
    image_in = Input(input_shape)

    features = image_in
    for n_filters in (32, 64):
        features = Conv2D(n_filters, (3, 3))(features)
        features = PReLU()(features)

    features = Flatten()(features)
    embedding = Dense(10, activation='softmax')(features)
    return Model(inputs=image_in, outputs=embedding)

# Module-level handle to the anchor branch's embedding tensor; assigned by
# build_model().
anchor_embedding = None


def build_model(input_shape):
    """Build and compile the three-input triplet training model.

    One embedding network is created and applied to the positive, negative
    and anchor inputs, so all three share its weights. The three embeddings
    are concatenated (anchor, positive, negative) and fed to ``triplet_loss``
    through a ``Lambda`` layer, whose output is the model output.

    Side effects:
        Sets the Keras image data format to ``'channels_last'`` and stores
        the anchor embedding tensor in the module-level ``anchor_embedding``.

    Args:
        input_shape: shape of a single image, used for all three inputs.

    Returns:
        A Keras ``Model`` taking ``[anchor, positive, negative]`` inputs,
        compiled with mean absolute error and an ``Adam`` optimizer.
    """
    global anchor_embedding

    # Standardize the input shape order before any layer is created.
    K.set_image_data_format('channels_last')

    pos_in = Input(shape=input_shape)
    neg_in = Input(shape=input_shape)
    anchor_in = Input(shape=input_shape)

    # A single network instance is reused so the weights are shared.
    shared_net = create_embedding_network(input_shape)

    pos_vec = shared_net(pos_in)
    neg_vec = shared_net(neg_in)
    anchor_embedding = shared_net(anchor_in)

    # Order matters: triplet_loss splits this back into anchor/positive/negative.
    stacked = concatenate([anchor_embedding, pos_vec, neg_vec])
    loss_out = Lambda(triplet_loss, output_shape=(1,))(stacked)

    siamese = Model(inputs=[anchor_in, pos_in, neg_in], outputs=loss_out)
    siamese.compile(loss='mean_absolute_error', optimizer=Adam())
    return siamese

numOfPhotosPerTag = 10
# Change this line to your own drive path
baseDir = "C:/Intelligent systems/DNN/images/"
imagesHashtags = ["beer", "bigcity"]
imagesDir = [baseDir + str(x) for x in imagesHashtags]
images = ["/" + str(x) + ".jpg" for x in range(1, numOfPhotosPerTag + 1)]

# One path per (tag directory, file name) combination, grouped by tag.
allImages = [tag_dir + loc for tag_dir in imagesDir for loc in images]

# Reads the first image and evaluates its shape; the value is discarded when
# this runs as a script (it is only displayed in an interactive session).
imageio.imread(allImages[0], pilmode="RGB").shape

# Pair every image with its tag, taken from the name of its parent directory.
data = []
for x in allImages:
    image = imageio.imread(x, pilmode="RGB")
    tag = x.split('/')[-2]
    data.append((image, tag))

data = np.array(data)

triplets = get_triplets(data)

model = build_model((256, 256, 3))

# Fit on every triplet in turn. The previous loop always passed triplets[0],
# so the model only ever saw the first triplet. The target is zero because
# the model output is the triplet loss itself.
for triplet in triplets:
    model.fit(list(triplet), y=[0], batch_size=1, verbose=10)

【问题讨论】：

标签： python tensorflow keras neural-network

【解决方案1】：

如果您使用name= 标记模型的“正常”一半，则可以提取所需的层。为此，我们使用以下代码：

def triplet2normal(model, keep_str='pos', out='score'):
    """Take a triplet model and keep only the half selected by *keep_str*.

    Args:
        model: Keras model whose layers and input tensors are named so that
            the wanted half can be found by substring.
        keep_str: substring identifying the inputs, and the output layer, to
            keep.
        out: second substring the output layer's name must also contain.

    Returns:
        A new ``Model`` whose inputs are the input tensors of *model* with
        *keep_str* in their name, and whose output is the output of the first
        layer whose name contains both *keep_str* and *out*.

    Raises:
        ValueError: if no layer name contains both *keep_str* and *out*.
    """
    # First layer whose name mentions both markers. A default is passed to
    # next() so a miss is reported clearly instead of leaking StopIteration.
    new_out_layer_name = next(
        (layer.name for layer in model.layers
         if keep_str in layer.name and out in layer.name),
        None,
    )
    if new_out_layer_name is None:
        raise ValueError(
            "no layer name contains both %r and %r" % (keep_str, out)
        )

    # model.inputs is always a list, including for single-input models.
    model_half = Model(
        inputs=[tensor for tensor in model.inputs if keep_str in tensor.name],
        outputs=model.get_layer(new_out_layer_name).output,
    )
    return model_half

模型可以是任何三元组模型 - 下面的示例用于推荐任务，例如 MovieLens 数据集：

# Example triplet-style recommender: one user input is scored against a
# positive item and a negative item, and both items go through the same
# embedding layer.
# NOTE(review): Embedding, Dot, Subtract, Activation, num_items, num_users and
# latent_dim are not defined in the visible code - presumably imported or
# defined elsewhere; confirm before running.

# Input placeholders. The names carry 'pos' / 'neg' markers so that inputs can
# later be selected by substring (see triplet2normal).
positive_item_input = Input((1,), name='pos_item_input')
negative_item_input = Input((1,), name='neg_item_input')
user_input = Input((1,), name='pos_neg_user_input')

# Embedding layers for the items and for users
item_embedding_layer = Embedding(num_items, latent_dim, name='pos_neg_item_embedding', input_length=1)
user_embedding_layer = Embedding(num_users, latent_dim, name='pos_neg_user_embedding', input_length=1)

# Flatten the embedding layers. The single item embedding layer is applied to
# both the positive and the negative item input.
positive_item_embedding = Flatten(name='pos_item_embedded')(item_embedding_layer(positive_item_input))
negative_item_embedding = Flatten(name='neg_item_embedded')(item_embedding_layer(negative_item_input))
user_embedding = Flatten(name='pos_neg_user_embedded')(user_embedding_layer(user_input))

# Dot product - Matrix factorization: one user/item score per branch.
positive_scores = Dot(axes=1, name='positive_scores')([user_embedding, positive_item_embedding])
negative_scores = Dot(axes=1, name='negative_scores')([user_embedding, negative_item_embedding])

# Compare scores: sigmoid of (negative score - positive score) is the model
# output.
delta_scores_1 = Subtract(name='delta_scores')([negative_scores, positive_scores])
loss = Activation('sigmoid')(delta_scores_1)

# Define model
model = Model(
    inputs=[user_input, positive_item_input, negative_item_input],
    outputs=loss,
)

【讨论】：

【解决方案2】：

如果您已正确训练您的 embedding_network，您现在就不需要再使用三元组了。
基本上，triplet-loss 概念的全部要点是学习一个与预定义度量（通常只是欧几里得距离）兼容的嵌入，然后将这个嵌入用于您提到的简单的 KNN 分类。
因此，获取您的标记数据并通过embedding_network 传递所有点。
您现在在（低维？）空间中有一组点，其中“接近的点”属于同一类。同样，这取决于数据、训练的成功程度等。
很自然的做法是将您的测试点通过相同的embedding_network，并将其与嵌入空间中标记点的距离进行比较。
KNN 是一种可行的分类解决方案，但真正的重点是，您的数据已经被高度非线性地变换到一个“舒适”的空间，在该空间中，许多经典而简单的方法会更容易奏效：聚类、分类，应有尽有。
希望对您有所帮助，祝您好运！

【讨论】：