Deep Learning Course 2

Improving Deep Neural Networks: Hyperparameter tuning, Regularization and Optimization

Week 1

Week 2

1 - Gradient Descent


# gradient-descent update for layer l+1
i = str(l+1)
parameters["W"+i] = parameters["W"+i] - learning_rate * grads['dW'+i]
parameters["b"+i] = parameters["b"+i] - learning_rate * grads['db'+i]

2 - Mini-Batch Gradient Descent


# complete mini-batches of size mini_batch_size
mini_batch_X = shuffled_X[:, k * mini_batch_size: (k+1) * mini_batch_size]
mini_batch_Y = shuffled_Y[:, k * mini_batch_size: (k+1) * mini_batch_size]
# ...
# last, possibly smaller, mini-batch with the remaining examples
mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size:]
mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size:]
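
The slicing above assumes the columns of X and Y were already shuffled with one shared permutation; a minimal sketch of that step (assuming X has shape (n_x, m) and Y has shape (1, m)):

# one shared permutation keeps examples and labels aligned
permutation = list(np.random.permutation(m))
shuffled_X = X[:, permutation]
shuffled_Y = Y[:, permutation].reshape((1, m))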

3 - Momentum

# initialize the velocity v as zeros with the same shapes as the parameters
i = str(l+1)
v["dW"+i] = np.zeros_like(parameters['W'+i])
v["db"+i] = np.zeros_like(parameters['b'+i])


$$
v_{db^{[l]}} = \beta \, v_{db^{[l]}} + (1 - \beta) \, db^{[l]} \\
b^{[l]} = b^{[l]} - \alpha \, v_{db^{[l]}}
\tag{4}
$$
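
The weight matrices follow the same momentum rule:

$$
v_{dW^{[l]}} = \beta \, v_{dW^{[l]}} + (1 - \beta) \, dW^{[l]} \\
W^{[l]} = W^{[l]} - \alpha \, v_{dW^{[l]}}
\tag{3}
$$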

i = str(l+1)
# exponentially weighted average of the gradients (the velocity)
v["dW"+i] = beta * v["dW"+i] + (1 - beta) * grads['dW'+i]
v["db"+i] = beta * v["db"+i] + (1 - beta) * grads['db'+i]
# update parameters
parameters["W"+i] = parameters["W"+i] - learning_rate * v["dW"+i]
parameters["b"+i] = parameters["b"+i] - learning_rate * v["db"+i]

4 - Adam

# initialize the first (v) and second (s) moment estimates as zeros
i = str(l+1)
v["dW"+i] = np.zeros_like(parameters["W"+i])
v["db"+i] = np.zeros_like(parameters["b"+i])
s["dW"+i] = np.zeros_like(parameters["W"+i])
s["db"+i] = np.zeros_like(parameters["b"+i])

$$v^{corrected}_{dW^{[l]}} = \frac{v_{dW^{[l]}}}{1 - (\beta_1)^t}$$

$$s^{corrected}_{dW^{[l]}} = \frac{s_{dW^{[l]}}}{1 - (\beta_2)^t}$$

$$W^{[l]} = W^{[l]} - \alpha \frac{v^{corrected}_{dW^{[l]}}}{\sqrt{s^{corrected}_{dW^{[l]}}} + \varepsilon}$$
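
The quantities being bias-corrected above are the usual exponentially weighted averages of the gradient and of its elementwise square:

$$v_{dW^{[l]}} = \beta_1 \, v_{dW^{[l]}} + (1 - \beta_1) \, dW^{[l]}$$

$$s_{dW^{[l]}} = \beta_2 \, s_{dW^{[l]}} + (1 - \beta_2) \, (dW^{[l]})^2$$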

i = str(l+1)
# moving average of the gradients (first moment)
v["dW"+i] = beta1 * v["dW"+i] + (1 - beta1) * grads['dW'+i]
v["db"+i] = beta1 * v["db"+i] + (1 - beta1) * grads['db'+i]

# bias-corrected first moment estimate
v_corrected["dW"+i] = v["dW"+i] / (1 - beta1**t)
v_corrected["db"+i] = v["db"+i] / (1 - beta1**t)

# moving average of the squared gradients (second moment)
s["dW"+i] = beta2 * s["dW"+i] + (1 - beta2) * (grads['dW'+i])**2
s["db"+i] = beta2 * s["db"+i] + (1 - beta2) * (grads['db'+i])**2

# bias-corrected second moment estimate
s_corrected["dW"+i] = s["dW"+i] / (1 - beta2**t)
s_corrected["db"+i] = s["db"+i] / (1 - beta2**t)

# parameter update
parameters["W"+i] = parameters["W"+i] - learning_rate * (v_corrected["dW"+i] / (np.sqrt(s_corrected["dW"+i]) + epsilon))
parameters["b"+i] = parameters["b"+i] - learning_rate * (v_corrected["db"+i] / (np.sqrt(s_corrected["db"+i]) + epsilon))

Week 3

1.1 - Linear function

import numpy as np
import tensorflow as tf

# constant inputs: W is (4,3), X is (3,1), b is (4,1)
X = np.random.randn(3,1)
W = np.random.randn(4,3)
b = np.random.randn(4,1)
Y = tf.add(tf.matmul(W, X), b)   # Y = WX + b

sess = tf.Session()
result = sess.run(Y)
sess.close()
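
As a sanity check, the same computation in plain NumPy (no graph or session needed):

# NumPy equivalent of the graph above
result_np = np.dot(W, X) + b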

1.2 - Computing the sigmoid

x = tf.placeholder(tf.float32, name = "x")
sigmoid = tf.sigmoid(x)

with tf.Session() as sess:
    # feed the input z into the placeholder x
    result = sess.run(sigmoid, feed_dict = {x: z})
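
Assuming the lines above are wrapped in a function sigmoid(z) that returns result (the wrapper name is an assumption), a quick check:

print(sigmoid(0))    # 0.5
print(sigmoid(12))   # ~0.999994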

1.3 - Computing the Cost

z = tf.placeholder(tf.float32, name='z')
y = tf.placeholder(tf.float32, name='y')

# cross-entropy computed directly from the logits z (the sigmoid is applied internally)
cost = tf.nn.sigmoid_cross_entropy_with_logits(logits = z, labels = y)

sess = tf.Session()
cost = sess.run(cost, feed_dict = {z: logits, y: labels})
sess.close()
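
Per example, this op computes the binary cross-entropy directly on the logit $z$:

$$- \big( y \log \sigma(z) + (1 - y) \log(1 - \sigma(z)) \big)$$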

1.4 - Using One Hot encodings

# depth C as a tf constant; axis=0 puts the one-hot dimension in the rows
C = tf.constant(C)
one_hot_matrix = tf.one_hot(labels, C, axis=0)

sess = tf.Session()
one_hot = sess.run(one_hot_matrix)
sess.close()
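
For example, assuming the snippet is wrapped in a helper one_hot_matrix(labels, C) as in the notebook, labels = np.array([1, 2, 3, 0, 2, 1]) with C = 4 gives one column per label:

# one_hot_matrix(labels, C=4), axis=0 -> shape (4, 6)
# [[0. 0. 0. 1. 0. 0.]
#  [1. 0. 0. 0. 0. 1.]
#  [0. 1. 0. 0. 1. 0.]
#  [0. 0. 1. 0. 0. 0.]]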

1.5 - Initialize with zeros and ones

# tensor of ones with the given shape
ones = tf.ones(shape)

sess = tf.Session()
ones = sess.run(ones)
sess.close()

2.1 - Create placeholders

# None lets the number of examples vary between mini-batches
X = tf.placeholder(tf.float32, [n_x, None])
Y = tf.placeholder(tf.float32, [n_y, None])

2.2 - Initializing the parameters

# three-layer network 12288 -> 25 -> 12 -> 6: Xavier initialization for weights, zeros for biases
W1 = tf.get_variable("W1", [25,12288], initializer = tf.contrib.layers.xavier_initializer(seed = 1))
b1 = tf.get_variable("b1", [25,1], initializer = tf.zeros_initializer())
W2 = tf.get_variable("W2", [12,25], initializer = tf.contrib.layers.xavier_initializer(seed = 1))
b2 = tf.get_variable("b2", [12,1], initializer = tf.zeros_initializer())
W3 = tf.get_variable("W3", [6,12], initializer = tf.contrib.layers.xavier_initializer(seed = 1))
b3 = tf.get_variable("b3", [6,1], initializer = tf.zeros_initializer())

2.3 - Forward propagation in TensorFlow

Z1 = tf.matmul(W1, X) + b1
A1 = tf.nn.relu(Z1)
Z2 = tf.matmul(W2, A1) + b2
A2 = tf.nn.relu(Z2)
Z3 = tf.matmul(W3, A2) + b3   # no softmax here; the cost function applies it

2.4 - Compute cost

# logits = tf.transpose(Z3), labels = tf.transpose(Y): this op expects shape (batch, classes)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels))

2.5 - Backward propagation & parameter updates

X, Y = create_placeholders(n_x, n_y)
parameters = initialize_parameters()
Z3 = forward_propagation(X, parameters)
cost = compute_cost(Z3, Y)
# backpropagation and the parameter update are handled by the optimizer node
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)
# ...
# each run of the optimizer performs one update step on the current mini-batch
_ , minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X, Y: minibatch_Y})
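
A minimal sketch of how these pieces are usually tied together in a TF 1.x session loop (num_epochs, minibatch_size, X_train, Y_train and the random_mini_batches helper from Week 2 are assumed):

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(num_epochs):
        epoch_cost = 0.
        minibatches = random_mini_batches(X_train, Y_train, minibatch_size)
        for (minibatch_X, minibatch_Y) in minibatches:
            # one optimizer run = one Adam update on the current mini-batch
            _, minibatch_cost = sess.run([optimizer, cost],
                                         feed_dict={X: minibatch_X, Y: minibatch_Y})
            epoch_cost += minibatch_cost / len(minibatches)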