Title: Deep Neural Network dimension issue
Posted: 2020-08-09 07:04:33
Question:

There is a problem with the dimensions in my deep neural network, which raises the following error:

line 104, in linear_backward
dW = (dz @ a.T) * W * lambd / m
ValueError: operands could not be broadcast together with shapes (8,8) (8,32)

It seems clear to me that the problem is in the backpropagation, but I can't find my mistake. I apologise in advance for the messy code, and I would really appreciate any help, since I'm a high school student without anyone around who can help.
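
For reference, this is the formula I think I should be implementing for the weight gradient with L2 regularisation (a sketch of my own understanding, so it may itself be wrong); the shapes mirror the ones in the error message:

import numpy as np

# Expected shapes for a layer with 8 units fed by 32 units, batch size m:
m = 426
dZ = np.random.randn(8, m)        # gradient of the loss w.r.t. this layer's pre-activation
A_prev = np.random.randn(32, m)   # activations of the *previous* layer
W = np.random.randn(8, 32)
lambd = 0.01

dW = (dZ @ A_prev.T) / m + (lambd / m) * W    # (8, 32): the L2 term is added, not multiplied
db = np.sum(dZ, axis=1, keepdims=True) / m    # (8, 1)
dA_prev = W.T @ dZ                            # (32, m)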

The program is as follows:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


# Neural Network Build

def sigmoid(Z):
    # activation function used to classify the output
    # range of the function is between 0 and 1.
    a = (1 + np.exp(-Z)) ** -1
    return a


def relu(Z):
    # activation function used in hidden layers

    A = np.maximum(0, Z)  # outputs Z if Z is positive, otherwise returns 0

    assert (A.shape == Z.shape) #stops program if this isn't true

    return A


def relu_back(dA, Z):
    # function finding the derivative of the relu function
    dZ = np.array(dA, copy=True)

    dZ[Z <= 0] = 0

    return dZ


def sigmoid_back(dA, Z):
    # function finding the derivative of the sigmoid function
    s = sigmoid(Z)

    dZ = dA * s * (1 - s)

    return dZ


def initialize_parameters(layer_dims):
    parameters = {}
    L = len(layer_dims)
    for layer in range(1, L):
        parameters["W" + str(layer)] = np.random.randn(layer_dims[layer], layer_dims[layer - 1]) * np.sqrt(
            2 / layer_dims[layer - 1])
        parameters["b" + str(layer)] = np.zeros((layer_dims[layer], 1))

        print(parameters["W" + str(layer)].shape)

        assert (parameters['W' + str(layer)].shape == (layer_dims[layer], layer_dims[layer - 1]))
        assert (parameters['b' + str(layer)].shape == (layer_dims[layer], 1))

    return parameters


def forward_prop(X, parameters):
    L = len(parameters) // 2

    forward_cache = {}

    forward_cache["z1"] = parameters["W1"] @ X + parameters["b1"]
    forward_cache["a1"] = relu(forward_cache["z1"])

    for layer in range(2, L):
        forward_cache["z" + str(layer)] = parameters["W" + str(layer)] @ forward_cache["a" + str(layer - 1)] + \
                                          parameters["b" + str(layer)]
        forward_cache["a" + str(layer)] = relu(forward_cache["z" + str(layer)])

    # Output neuron will have sigmoid activation function applied to classify the output according to a probability.
    # This needs to be handled separately.
    forward_cache["z" + str(L)] = parameters["W" + str(L)] @ forward_cache["a" + str(L - 1)] + parameters["b" + str(L)]
    forward_cache["a" + str(L)] = sigmoid(forward_cache["z" + str(L)])

    AL = forward_cache["a" + str(L)]

    assert (AL.shape == (1, X.shape[1]))

    return AL, forward_cache # Return y_hat (the model's prediction) and forward_cache (used in back-prop)


def compute_loss(A, Y, parameters, lambd):
    L = len(parameters) // 2
    m = Y.shape[1]
    log_function = -(np.multiply(Y, np.log(A)) + np.multiply((1 - Y), np.log(1 - A)))
    L2_regularisation_cost = 0
    for weight in range(1, L):
        L2_regularisation_cost += np.sum(np.square(parameters["W" + str(weight)]))

    loss = 1. / m * (np.nansum(log_function) + (L2_regularisation_cost * lambd / 2))

    loss = np.squeeze(loss)

    assert (loss.shape == ())

    return loss


def linear_backward(dz, W, b, a, lambd):
    m = a.shape[1]
    dW = (dz @ a.T) * W * lambd / m
    db = np.sum(dz, axis=1, keepdims=True)
    dA_prev = W.T @ dz

    print("W:", W.shape,"dZ:", dz.shape)
    print(dA_prev.shape, a.shape)

#    assert (dA_prev.shape == a.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)

    return dW, db, dA_prev


def activation_backward(dA, a, z, W, b, activation_function, lambd):
    if activation_function == "sigmoid":
        dz = sigmoid_back(dA, z)
    elif activation_function == "relu":
        dz = relu_back(dA, z)

    dW, db, dA_prev = linear_backward(dz, W, b, a, lambd)

    return dW, db, dA_prev


def backward_prop(AL, Y, cache, parameters, lambd):
    L = len(parameters) // 2
    print(AL)
    gradients = {}
    dAL = -(np.divide(Y, AL)) + np.divide(1 - Y, 1 - AL)
    dz = sigmoid_back(dAL, cache["z" + str(L)])
    gradients["dW" + str(L)], gradients["db" + str(L)], dA_prev = \
        linear_backward(dz, parameters["W" + str(L)], parameters["b" + str(L)], cache["a" + str(L)], lambd)

    for l in reversed(range(1, L)):
        gradients["dW" + str(l)], gradients["db" + str(l)], dA_prev = \
            activation_backward(dA_prev, cache["a" + str(l)], cache["z" + str(l)],
                                parameters["W" + str(l)], parameters["b" + str(l)],
                                "relu", lambd)

    return gradients


def update_parameters(parameters, gradients, learning_rate):
    L = len(parameters) // 2

    for l in range(1, L + 1):
        parameters["W" + str(l)] = parameters["W" + str(l)] - (gradients["dW" + str(l)] * learning_rate)
        parameters["b" + str(l)] = parameters["b" + str(l)] - (gradients["db" + str(l)] * learning_rate)

    return parameters


def predict(X, Y, parameters):
    """
        This function is used to predict the results of a  L-layer neural network.

        Arguments:
        X -- data set of examples you would like to label
        parameters -- parameters of the trained model

        Returns:
        p -- predictions for the given dataset X
        """

    m = X.shape[1]
    # n = len(parameters) // 2 number of layers in the neural network
    binary_outcome = np.zeros((1, m))

    # Forward propagation
    probabilities = deep_neural_network(X, Y, parameters, )

    # convert probabilities to 0/1 predictions
    for i in range(0, probabilities.shape[1]):
        if probabilities[0, i] > 0.5:
            binary_outcome[0, i] = 1
        else:
            binary_outcome[0, i] = 0

    # print results
    # print ("predictions: " + str(p))
    # print ("true labels: " + str(y))
    print("Accuracy: " + str(np.sum((binary_outcome == Y) / m)))

    return binary_outcome


def deep_neural_network(X, Y, layer_dims, number_of_iterations, learning_rate, print_cost, lambd):
    losses = []

    for epoch in range(number_of_iterations):

        parameters = initialize_parameters(layer_dims)

        AL, forward_cache = forward_prop(X, parameters)

        loss = compute_loss(AL, Y, parameters, lambd)

        losses.append(loss)

        gradients = backward_prop(AL, Y, forward_cache, parameters, lambd)

        parameters = update_parameters(parameters, gradients, learning_rate)

        if (epoch % 100 == 0) and print_cost:
            print(loss)

    return parameters



cancer = load_breast_cancer()
data = cancer.data
labels = cancer.target

xtrain, xtest, ytrain, ytest = train_test_split(data, labels)

xtrain = xtrain.T
ytrain = ytrain.reshape((1, 426))

parameters, costs = deep_neural_network(xtrain, ytrain, [30, 64, 32, 8, 1], learning_rate=0.045,
                                        number_of_iterations=2000, print_cost=True, lambd=0.01)
plt.plot(np.squeeze(costs))
plt.ylabel('cost')
plt.xlabel('iterations (per tens)')
plt.title("Learning rate =" + str(0.075))
plt.show()
predictions = predict(X=xtrain, parameters=parameters)

Comments:

    Tags: python numpy deep-learning neural-network


    Solution 1:

    Solved:

    The following code works for a customisable deep neural network. My DNN was trained on Extended MNIST.

    import numpy as np
    import matplotlib.pyplot as plt
    from emnist import extract_training_samples
    from emnist import extract_test_samples
    import keras
    import time
    import os
    
    
    # Converts data from the emnist database into the required format
    # for the neural network and outputs the X and Y datasets
    def getData(size, images, labels):
        training_samples = images.shape[0]
    
        # Converts each matrix of pixels (28x28) into feature vectors
        xtrain = images.reshape(training_samples, 784)
    
        # Arranges each training example into a column
        xtrain = xtrain.T
    
        #normalises the database to prevent the exploding gradients problem
        xtrain = xtrain / 255
    
        ytrain = labels.reshape(1, training_samples)
    
        # Converts the numerical labels into one-hot arrays with 47 elements.
        # For example a label of 10 will be converted into an array of 0s with
        # a 1 in the 10th position. This can be abstracted as the 10th neuron
        # firing in the neural network.
        y_train = keras.utils.to_categorical(ytrain, 47)
    
        y_train = y_train[0].T
    
        # Replaces 1s and 0s with approximate probabilities to prevent NaN errors in
        # the cost entropy function
        y_train[y_train == 0] = 0.01
        y_train[y_train == 1] = 0.99
    
        # Allows choice of the number of training examples input into the neural network.
        xsmall = np.hsplit(xtrain, [size, (training_samples - size)])[0]
        ysmall = np.hsplit(y_train, [size, (training_samples - size)])[0]
    
        # sanity check
        print(xsmall.shape, ysmall.shape)
    
        return xsmall, ysmall
    
    
    # Activation function that replaces all negative values with 0
    def relu(Z):
        A = np.maximum(0, Z)
    
        assert (A.shape == Z.shape)
    
        return A
    
    
    # Derivative of the relu function: 0 if Z is negative and Z if Z is positive
    def relu_backward(dA, cache, layer):
    
        # Accesses Z from the cache set up in the feed_forward function
        Z = cache["Z" + str(layer)]
    
        dZ = np.array(dA, copy=True)
    
        # At 0, I've set the derivative to be 0 as well, although it isn't differentiable at that point
        dZ[Z <= 0] = 0
    
        assert (dZ.shape == Z.shape)
    
        return dZ
    
    # Activation function on the final layer that converts the linear transformation
    # from the layer into probabilities that sum to 1. This is the neural net's estimate of how
    # likely it is that the input example is each label in the dataset: the highest probability is
    # what we choose as the neural net's output
    def softmax(Z):
    
        exp_Z = np.exp(Z - np.max(Z))
    
        A = exp_Z / np.sum(exp_Z, axis=0)
    
        assert (A.shape == Z.shape)
    
        return A
    
    
    # Derivative of the softmax function works very well with the cross-entropy cost function,
    # which simplifies to the difference between the neural net's guess and the actual label.
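    # Mathematically: with softmax outputs a_i = exp(z_i) / sum_j exp(z_j) and the
    # cross-entropy cost C = -sum_i y_i * log(a_i), the chain rule gives dC/dz_i = a_i - y_i,
    # which is why dZ below is simply AL - Y.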
    def softmax_backward(cache, layer):
    
        # fetching values from the cache
        Z = cache["Z" + str(layer)]
        AL = cache["A" + str(layer)]
        Y = cache["Y"]
    
        dZ = AL - Y
    
        assert (dZ.shape == Z.shape)
    
        return dZ
    
    
    # This sets up our parameters depending on the required dimensions of the neural network.
    def initialise_parameters(layer_dims):
        parameters = {}

        np.random.seed(2)

        for l in range(1, len(layer_dims)):
            W_layer = "W" + str(l)
            b_layer = "b" + str(l)

            # He initialisation allows for symmetry breaking and efficient gradient descent
            parameters[W_layer] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(2 / layer_dims[l - 1])
            parameters[b_layer] = np.zeros((layer_dims[l], 1))

            assert (parameters[W_layer].shape == (layer_dims[l], layer_dims[l - 1]))
            assert (parameters[b_layer].shape == (layer_dims[l], 1))

        return parameters
    
    
    # Outputs Z as the linear transformation of the previous layer's output using the parameters: Weight and bias.
    def linear_forward(W, A_previous, b):
        Z = W @ A_previous + b
    
        assert (Z.shape == (W.shape[0], A_previous.shape[1]))
    
        return Z
    
    
    # chooses the activation function that Z passes through depending on the activation_type requested.
    def activation_forward(Z, activation_type):
        if activation_type == "relu":
            A = relu(Z)
    
        elif activation_type == "softmax":
            A = softmax(Z)
    
        assert (A.shape == Z.shape)
    
        return A
    
    
    
    def feed_forward(X, parameters, Y):
        cache = {}
        A = X
    
        # Fetches number of layers in the neural network.
        L = len(parameters) // 2
    
        # considers the feature vector input as the 0th layer, initialising the cache
        cache["A0"] = X
        cache["Y"] = Y
    
        # Uses the relu for all the hidden layers
        for l in range(1, L + 1):
    
            # Uses softmax for the output layer
            activation_type = "relu"
            if l == L:
                activation_type = "softmax"
    
            # Combines all the helper functions above to propagate through the neural network
            A_previous = A
            W = parameters["W" + str(l)]
            b = parameters["b" + str(l)]
            Z = linear_forward(W, A_previous, b)
            A = activation_forward(Z, activation_type)
    
            # We store these parameters for use in the back propagation, since we use the element inside
            # when differentiating according to the chain rule. Saving these here improves efficiency and speed.
            cache["W" + str(l)] = W
            cache["b" + str(l)] = b
            cache["Z" + str(l)] = Z
            cache["A" + str(l)] = A
    
        # Returns the final output layer and the cache
        return cache["A" + str(L)], cache
    
    
    def compute_cost(AL, Y):
    
        training_samples = Y.shape[1]
    
        # Zeroes can introduce NaN errors in the cost function, so we approximate them closely
        # to prevent log(0) from occurring and messing up our cost.
        AL_nonzero = np.maximum(AL, 1.0e-15)
        cost = (-1. / training_samples) * (np.sum(np.multiply(Y, np.log(AL_nonzero))) + np.sum(np.multiply(1 - Y, np.log(1 - AL_nonzero))))
    
        # Removes unnecessary dimensions from the cost value, so [[0.15]] is converted into 0.15
        cost = np.squeeze(cost)
        assert (cost.shape == ())
    
        return cost
    
    
    # Performs backward propagation from one layer to the previous depending on
    # the activation function specified.
    def activation_backward(dAL, cache, layer, activation_type):
        if activation_type == "relu":
            dZ = relu_backward(dAL, cache, layer)
    
        elif activation_type == "softmax":
            dZ = softmax_backward(cache, layer)
    
        # Accesses values from cache
        A_prev = cache["A" + str(layer - 1)]
        W = cache["W" + str(layer)]
        b = cache["b" + str(layer)]
        training_samples = A_prev.shape[1]
    
    
        dW = (1. / training_samples) * np.dot(dZ, A_prev.T)
        db = (1. / training_samples) * np.sum(dZ, axis=1, keepdims=True)
        dA_prev = np.dot(W.T, dZ)
    
        assert (dA_prev.shape == A_prev.shape)
        assert (dW.shape == W.shape)
        assert (db.shape == b.shape)
    
        return dA_prev, dW, db
    
    
    # Performs backward propagation on the whole neural network to find the derivatives of the
    # cost function with respect to the parameters Weight and bias of each layer. These derivatives will
    # allow us to perform gradient descent on the the cost function, tuning the parameters to improve accuracy.
    def back_propagation(cache, Y, L):
        gradients = {}
    
        # Accesses output layer matrix from cache
        AL = cache["A" + str(L)]
    
        # Derivative of the cross entropy cost function
        gradients["dA" + str(L)] = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    
        # Uses relu_backward for all the hidden layers
        for l in reversed(range(1, L + 1)):
            activation_type = "relu"
    
            # Uses softmax_backward for the output layer
            if l == L:
                activation_type = "softmax"
    
            # finds gradients of each layer
            dA_previous, dW_current, db_current = activation_backward(gradients["dA" + str(l)], cache, l, activation_type)
    
            # Stores these gradients in the gradients dictionary
            gradients["dA" + str(l - 1)] = dA_previous
            gradients["dW" + str(l)] = dW_current
            gradients["db" + str(l)] = db_current
    
        return gradients
    
    
    # Updates all of the parameters by subtracting their gradients from them
    def update_parameters(gradients, parameters, learning_rate):
        L = len(parameters) // 2
    
        for l in range(1, L + 1):  # include the output layer in the update
            W_layer = "W" + str(l)
            b_layer = "b" + str(l)
            dW_layer = "dW" + str(l)
            db_layer = "db" + str(l)
    
            # The learning rate needs to be tuned to approach the minimum of the cost accurately and reliably
            # while optimising time taken to train the neural net
            parameters[W_layer] = parameters[W_layer] - (gradients[dW_layer] * learning_rate)
            parameters[b_layer] = parameters[b_layer] - (gradients[db_layer] * learning_rate)
    
        return parameters
    
    
    # Calculates accuracy of the neural net's prediction against the true labels
    def calculate_accuracy(parameters, x, y_onehot, size):
    
        # Uses parameters to get the neural net's guess
        AL, cache = feed_forward(x, parameters, y_onehot)
    
        # Finds the highest probability character for each training example
        AL_max = np.argmax(AL, axis=0)
    
        AL_max = AL_max.reshape(1, AL_max.shape[0])
    
        # Converts the labels from one-hot matrices back into a numerical vector
        y_int = np.argmax(y_onehot, axis=0)
        y_int = y_int.reshape(1, y_int.shape[0])
    
        # Compares the two
        accuracy = np.mean(AL_max == y_int)
    
        return accuracy
    
    # Brings everything together to train the model
    def NN_model(X, Y, layer_dims, learning_rate, num_iterations, size, print_values):
    
        # Accesses initial parameters
        parameters = initialise_parameters(layer_dims)
    
        # Used to calculate the training time over the iteration
        times = []
    
        # Used to plot the accuracies at the end
        accuracies = []
    
        times.append(time.time())
    
        for i in range(num_iterations):
    
            # Accesses the output and cache using forward propagation through the parameters
            AL, cache = feed_forward(X, parameters, Y)
    
            # Calculates the gradients using the previously generated cache
            gradients = back_propagation(cache, Y, len(layer_dims) - 1)
    
            # Tunes the parameters to achieve greater accuracy
            parameters = update_parameters(gradients, parameters, learning_rate)
    
    
            # Used to track progress during training
            if i % 10 == 0 and print_values:
                time_difference = time.time() - times[-1]
    
                times.append(time.time())
    
                print("Time:", time_difference)
    
                cost = compute_cost(AL, Y)
    
                accuracy = calculate_accuracy(parameters, x_test, y_test, size)
    
                print("Cost on test dataset after iteration %i: %f, accuracy: %f" % (i, cost, accuracy))
    
                #           costs.append(cost)
    
                accuracies.append(accuracy)
    
            if i % 1000 == 0:
                learning_rate = learning_rate * 0.98
    
        # Plots graph of accuracy to mark improvement
        plt.xlabel('iterations')
        plt.title("Learning rate =" + str(learning_rate))
    
        line, = plt.plot(np.squeeze(accuracies), label='accuracies')
    
        plt.xlabel('iterations x10')
    
        plt.legend()
    
        plt.show()
    
        return parameters
    
    
    
    size = 30000
    
    images, labels = extract_training_samples('balanced')
    
    x_train, y_train = getData(size, images, labels)
    
    images, labels = extract_test_samples('balanced')
    
    global x_test
    global y_test
    
    x_test, y_test = getData(size, images, labels)
    
    layer_dims = [784, 30, 20, 47]
    
    parameters = initialise_parameters(layer_dims)
    
    print(calculate_accuracy(parameters, x_train, y_train, size))
    
    parameters = NN_model(x_train, y_train, layer_dims, 0.03, 40000, size, print_values=True)
    
    print(calculate_accuracy(parameters, x_train, y_train, size))
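
    As a quick extra sketch (this bit isn't called anywhere above), per-example predictions can be read off by taking the argmax over the output layer returned by feed_forward; y_train is passed only because the cache expects a Y:

    AL, _ = feed_forward(x_train, parameters, y_train)
    predicted_labels = np.argmax(AL, axis=0)   # most probable class (0-46) for each example
    true_labels = np.argmax(y_train, axis=0)   # recover integer labels from the one-hot matrix
    print(predicted_labels[:10], true_labels[:10])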
    

    Comments:
