Improving Deep Neural Networks: Hyperparameter tuning, Regularization and Optimization
Week 1
Week 2
1 - Gradient Descent
i = str(l + 1)
# One step of gradient descent on layer l+1
parameters["W" + i] = parameters["W" + i] - learning_rate * grads["dW" + i]
parameters["b" + i] = parameters["b" + i] - learning_rate * grads["db" + i]
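These two updates are the body of a loop over the layers. A minimal, self-contained sketch of that loop wrapped in a function; the one-layer toy values below are made up for illustration:

import numpy as np

def update_parameters_with_gd(parameters, grads, learning_rate):
    L = len(parameters) // 2                      # number of layers
    for l in range(L):
        i = str(l + 1)
        parameters["W" + i] = parameters["W" + i] - learning_rate * grads["dW" + i]
        parameters["b" + i] = parameters["b" + i] - learning_rate * grads["db" + i]
    return parameters

# Toy one-layer check (illustrative values only)
params = {"W1": np.array([[1.0, 2.0]]), "b1": np.array([[0.5]])}
grads = {"dW1": np.array([[0.2, -0.1]]), "db1": np.array([[0.3]])}
print(update_parameters_with_gd(params, grads, learning_rate=0.1))
# W1 becomes [[0.98, 2.01]], b1 becomes [[0.47]]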
2 - Mini-Batch Gradient Descent
# Slice out the k-th complete mini-batch of size mini_batch_size
mini_batch_X = shuffled_X[:, k * mini_batch_size : (k + 1) * mini_batch_size]
mini_batch_Y = shuffled_Y[:, k * mini_batch_size : (k + 1) * mini_batch_size]

# The last mini-batch holds the leftover examples when m is not a multiple of mini_batch_size
mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size :]
mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size :]
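For context, a minimal sketch of the full partitioning routine these slices come from, assuming X has shape (n_x, m) and Y has shape (n_y, m); the column-wise shuffle and the two-step partition are the standard construction:

import math
import numpy as np

def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    np.random.seed(seed)
    m = X.shape[1]                                # number of examples
    mini_batches = []

    # Step 1: shuffle X and Y with the same permutation of the columns
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]

    # Step 2: partition into complete mini-batches of size mini_batch_size
    num_complete_minibatches = math.floor(m / mini_batch_size)
    for k in range(num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size : (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size : (k + 1) * mini_batch_size]
        mini_batches.append((mini_batch_X, mini_batch_Y))

    # Step 3: keep the leftover examples as a final, smaller mini-batch
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size :]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size :]
        mini_batches.append((mini_batch_X, mini_batch_Y))

    return mini_batches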
3 - Momentum
i = str(l + 1)
# Velocities start at zero, with the same shape as the corresponding parameters
v["dW" + i] = np.zeros_like(parameters["W" + i])
v["db" + i] = np.zeros_like(parameters["b" + i])
$$\begin{cases}
v_{db^{[l]}} = \beta v_{db^{[l]}} + (1 - \beta) db^{[l]} \\
b^{[l]} = b^{[l]} - \alpha v_{db^{[l]}}
\end{cases}\tag{4}$$
i = str(l + 1)
# Exponentially weighted average of the gradients (the "velocity")
v["dW" + i] = beta * v["dW" + i] + (1 - beta) * grads["dW" + i]
v["db" + i] = beta * v["db" + i] + (1 - beta) * grads["db" + i]

# Update the parameters with the velocity instead of the raw gradient
parameters["W" + i] = parameters["W" + i] - learning_rate * v["dW" + i]
parameters["b" + i] = parameters["b" + i] - learning_rate * v["db" + i]
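A small numeric illustration of what the velocity does (the gradient values are made up): with an oscillating gradient, the exponentially weighted average damps the back-and-forth instead of following it. beta = 0.9 is the usual default, and beta = 0 recovers plain gradient descent.

beta = 0.9
v = 0.0
for t, grad in enumerate([1.0, -1.0, 1.0, -1.0], start=1):
    v = beta * v + (1 - beta) * grad
    print(t, round(v, 4))
# 1 0.1
# 2 -0.01
# 3 0.091
# 4 -0.0181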
4 - Adam
i = str(l + 1)
# First (v) and second (s) moment estimates start at zero
v["dW" + i] = np.zeros_like(parameters["W" + i])
v["db" + i] = np.zeros_like(parameters["b" + i])
s["dW" + i] = np.zeros_like(parameters["W" + i])
s["db" + i] = np.zeros_like(parameters["b" + i])
$$v^{corrected}_{dW^{[l]}} = \frac{v_{dW^{[l]}}}{1 - (\beta_1)^t}$$
$$s^{corrected}_{dW^{[l]}} = \frac{s_{dW^{[l]}}}{1 - (\beta_2)^t}$$
$$W^{[l]} = W^{[l]} - \alpha \frac{v^{corrected}_{dW^{[l]}}}{\sqrt{s^{corrected}_{dW^{[l]}}} + \varepsilon}$$
i = str(l + 1)
# Moving average of the gradients (first moment)
v["dW" + i] = beta1 * v["dW" + i] + (1 - beta1) * grads["dW" + i]
v["db" + i] = beta1 * v["db" + i] + (1 - beta1) * grads["db" + i]

# Bias-corrected first moment estimates
v_corrected["dW" + i] = v["dW" + i] / (1 - beta1 ** t)
v_corrected["db" + i] = v["db" + i] / (1 - beta1 ** t)

# Moving average of the squared gradients (second moment)
s["dW" + i] = beta2 * s["dW" + i] + (1 - beta2) * grads["dW" + i] ** 2
s["db" + i] = beta2 * s["db" + i] + (1 - beta2) * grads["db" + i] ** 2

# Bias-corrected second moment estimates
s_corrected["dW" + i] = s["dW" + i] / (1 - beta2 ** t)
s_corrected["db" + i] = s["db" + i] / (1 - beta2 ** t)

# Parameter update
parameters["W" + i] = parameters["W" + i] - learning_rate * v_corrected["dW" + i] / (np.sqrt(s_corrected["dW" + i]) + epsilon)
parameters["b" + i] = parameters["b" + i] - learning_rate * v_corrected["db" + i] / (np.sqrt(s_corrected["db" + i]) + epsilon)
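A quick numeric illustration of the bias correction (the gradient value is made up): at t = 1 the moving averages are scaled-down versions of the gradient, and dividing by 1 - beta**t restores their magnitude. Common defaults are beta1 = 0.9, beta2 = 0.999 and epsilon = 1e-8.

import numpy as np

beta1, beta2, epsilon, learning_rate = 0.9, 0.999, 1e-8, 0.01
dW = np.array([[0.5]])                  # made-up gradient for a 1x1 "layer"
v = np.zeros_like(dW)
s = np.zeros_like(dW)

t = 1
v = beta1 * v + (1 - beta1) * dW        # 0.05: only 10% of the gradient
s = beta2 * s + (1 - beta2) * dW ** 2   # 0.00025
v_corrected = v / (1 - beta1 ** t)      # back to 0.5
s_corrected = s / (1 - beta2 ** t)      # back to 0.25 = dW**2
step = learning_rate * v_corrected / (np.sqrt(s_corrected) + epsilon)
print(step)                             # ~0.01, i.e. roughly learning_rate * sign(dW)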
Week 3
1.1 - Linear function
import numpy as np
import tensorflow as tf

# Compute Y = WX + b with random values: W is (4,3), X is (3,1), b is (4,1)
X = np.random.randn(3, 1)
W = np.random.randn(4, 3)
b = np.random.randn(4, 1)
Y = tf.add(tf.matmul(W, X), b)

sess = tf.Session()
result = sess.run(Y)
sess.close()
1.2 - Computing the sigmoid
# Create a placeholder for the input, build the graph, then feed z in at run time
x = tf.placeholder(tf.float32, name="x")
sigmoid = tf.sigmoid(x)

with tf.Session() as sess:
    result = sess.run(sigmoid, feed_dict={x: z})
1.3 - Computing the Cost
# Placeholders for the logits (z) and the labels (y)
z = tf.placeholder(tf.float32, name="z")
y = tf.placeholder(tf.float32, name="y")

# Sigmoid cross-entropy computed directly from the logits
cost = tf.nn.sigmoid_cross_entropy_with_logits(logits=z, labels=y)

sess = tf.Session()
cost = sess.run(cost, feed_dict={z: logits, y: labels})
sess.close()
1.4 - Using One Hot encodings
# axis=0 puts the one-hot (class) dimension along the rows
C = tf.constant(C)
one_hot_matrix = tf.one_hot(labels, C, axis=0)

sess = tf.Session()
one_hot = sess.run(one_hot_matrix)
sess.close()
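A small worked example of the axis=0 behaviour (the label values are made up): with depth C and axis=0, each column of the result is the one-hot encoding of one label, so the output has shape (C, number of labels).

import numpy as np
import tensorflow as tf                  # TensorFlow 1.x assumed

labels = np.array([1, 2, 3, 0, 2, 1])    # illustrative labels
one_hot_matrix = tf.one_hot(labels, depth=4, axis=0)

with tf.Session() as sess:
    print(sess.run(one_hot_matrix))
# [[0. 0. 0. 1. 0. 0.]
#  [1. 0. 0. 0. 0. 1.]
#  [0. 1. 0. 0. 1. 0.]
#  [0. 0. 1. 0. 0. 0.]]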
1.5 - Initialize with zeros and ones
# Create and evaluate a tensor of ones with the requested shape
ones = tf.ones(shape)
sess = tf.Session()
ones = sess.run(ones)
sess.close()
2.1 - Create placeholders
# "None" leaves the number of examples flexible
X = tf.placeholder(tf.float32, [n_x, None])
Y = tf.placeholder(tf.float32, [n_y, None])
2.2 - Initializing the parameters
# Xavier initialization for the weights, zeros for the biases
W1 = tf.get_variable("W1", [25, 12288], initializer=tf.contrib.layers.xavier_initializer(seed=1))
b1 = tf.get_variable("b1", [25, 1], initializer=tf.zeros_initializer())
W2 = tf.get_variable("W2", [12, 25], initializer=tf.contrib.layers.xavier_initializer(seed=1))
b2 = tf.get_variable("b2", [12, 1], initializer=tf.zeros_initializer())
W3 = tf.get_variable("W3", [6, 12], initializer=tf.contrib.layers.xavier_initializer(seed=1))
b3 = tf.get_variable("b3", [6, 1], initializer=tf.zeros_initializer())
2.3 - Forward propagation in TensorFlow
# LINEAR -> RELU -> LINEAR -> RELU -> LINEAR
# Z3 is returned without an activation because the cost function applies the softmax itself
Z1 = tf.matmul(W1, X) + b1
A1 = tf.nn.relu(Z1)
Z2 = tf.matmul(W2, A1) + b2
A2 = tf.nn.relu(Z2)
Z3 = tf.matmul(W3, A2) + b3
2.4 - Compute cost
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
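One layout detail worth noting: in this network Z3 and Y are laid out as (n_y, m), with the classes along the rows, while tf.nn.softmax_cross_entropy_with_logits expects the class dimension last. Both tensors are therefore typically transposed before being passed in; a sketch assuming that layout:

logits = tf.transpose(Z3)    # (n_y, m) -> (m, n_y)
labels = tf.transpose(Y)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))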
2.5 - Backward propagation & parameter updates
# Build the graph; minimize(cost) creates the op that runs backprop and one update step
X, Y = create_placeholders(n_x, n_y)
parameters = initialize_parameters()
Z3 = forward_propagation(X, parameters)
cost = compute_cost(Z3, Y)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Inside the training loop, run the optimizer and the cost on one mini-batch
_, minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X, Y: minibatch_Y})
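For context, a minimal sketch of how these pieces fit into a full TF 1.x training loop, assuming the helper functions from the sections above (create_placeholders, initialize_parameters, forward_propagation, compute_cost) and the random_mini_batches routine from Week 2 are defined; the data names, epoch count and hyperparameters are illustrative:

X, Y = create_placeholders(n_x, n_y)
parameters = initialize_parameters()
Z3 = forward_propagation(X, parameters)
cost = compute_cost(Z3, Y)
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(num_epochs):
        epoch_cost = 0.0
        minibatches = random_mini_batches(X_train, Y_train, mini_batch_size=32, seed=epoch)
        for minibatch_X, minibatch_Y in minibatches:
            # One sess.run of the optimizer = one forward pass, one backward pass, one update
            _, minibatch_cost = sess.run([optimizer, cost],
                                         feed_dict={X: minibatch_X, Y: minibatch_Y})
            epoch_cost += minibatch_cost / len(minibatches)
        print("Cost after epoch %i: %f" % (epoch, epoch_cost))
    parameters = sess.run(parameters)    # pull the trained values out of the graph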