The notes below are a summary based on the CS231n course materials.
0. References
https://cs231n.github.io/optimization-2/
http://cs231n.stanford.edu/handouts/derivatives.pdf
http://cs231n.stanford.edu/slides/2021/lecture_4.pdf
CS231n Assignment 2
1. Fully Connected Layer
Forward pass
$$Y=XW+B\tag{1}$$
import numpy as np


def affine_forward(x, w, b):
    out = None  # result of this layer's forward pass, i.e. Y above
    N = x.shape[0]
    x_ = x.reshape(N, -1)  # shape: (N, d_1, d_2, ..., d_k) => (N, D)
    out = x_ @ w + b
    cache = (x, w, b)  # saved for the backward pass
    return out, cache
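A quick usage sketch (not from the assignment; the shapes and variable values are made up) showing how a multi-dimensional input is flattened to (N, D) before the matrix multiply:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 3, 4))   # (N, d_1, d_2), so D = 3 * 4 = 12
w = rng.standard_normal((12, 5))     # (D, M)
b = rng.standard_normal(5)           # (M,)

out, cache = affine_forward(x, w, b)
print(out.shape)                     # (2, 5)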
Backward pass
Let $L$ be the loss. Split the affine map into $Z=XW$ and $Y=Z+B$; element-wise,

$$Y_{n,m}=Z_{n,m}+B_{m}\tag{2}$$

$$\begin{aligned}\frac{\partial{L}}{\partial{B_{m}}}&=\sum_{n}{\frac{\partial{L}}{\partial{Y_{n,m}}}\frac{\partial{Y_{n,m}}}{\partial{B_{m}}}}\\&=\sum_{n}{\frac{\partial{L}}{\partial{Y_{n,m}}}\cdot1}\\&=\sum_{n}{\frac{\partial{L}}{\partial{Y_{n,m}}}}\end{aligned}\tag{3}$$

That is,

$$\frac{\partial{L}}{\partial{B}}=\sum_n{\frac{\partial{L}}{\partial{Y_{n,\cdot}}}}\tag{4}$$
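Equation (4) can be spot-checked numerically. The sketch below is illustrative only (not part of the course code): it defines a scalar loss $L=\sum_{n,m} Y_{n,m}G_{n,m}$ for a fixed random $G$, so that $\partial L/\partial Y = G$ by construction, and compares the analytic bias gradient against central differences.

import numpy as np

rng = np.random.default_rng(0)
N, D, M = 4, 3, 5
X = rng.standard_normal((N, D))
W = rng.standard_normal((D, M))
B = rng.standard_normal(M)
G = rng.standard_normal((N, M))      # stands in for the upstream gradient dL/dY

def loss(b):
    # Scalar loss L = sum(Y * G), so dL/dY = G by construction.
    return np.sum((X @ W + b) * G)

# eq. (4): sum the upstream gradient over the batch axis.
db_analytic = G.sum(axis=0)

# Central-difference gradient, one bias component at a time.
eps = 1e-6
db_numeric = np.array([
    (loss(B + eps * np.eye(M)[m]) - loss(B - eps * np.eye(M)[m])) / (2 * eps)
    for m in range(M)
])

print(np.allclose(db_analytic, db_numeric))   # expect True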
$$\begin{aligned}\frac{\partial{L}}{\partial{Z_{n,m}}}&=\frac{\partial{L}}{\partial{Y_{n,m}}}\frac{\partial{Y_{n,m}}}{\partial{Z_{n,m}}}\\&=\frac{\partial{L}}{\partial{Y_{n,m}}}\cdot1\\&=\frac{\partial{L}}{\partial{Y_{n,m}}}\end{aligned}\tag{5}$$

That is,

$$\frac{\partial{L}}{\partial{Z}}=\frac{\partial{L}}{\partial{Y}}\tag{6}$$
From the definition of matrix multiplication,

$$Z_{n,m}=\sum_d{X_{n,d}W_{d,m}}\tag{7}$$
$$\begin{aligned}\frac{\partial{L}}{\partial{X_{n,d}}}&=\sum_m{\frac{\partial{L}}{\partial{Z_{n,m}}}\frac{\partial{Z_{n,m}}}{\partial{X_{n,d}}}}\\&=\sum_m{\frac{\partial{L}}{\partial{Z_{n,m}}}W_{d,m}}\end{aligned}\tag{8}$$

That is,

$$\frac{\partial{L}}{\partial{X}}=\frac{\partial{L}}{\partial{Z}}W^T\tag{9}$$
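As a spot check (illustrative only, with made-up shapes), the vectorized form (9) can be compared against the element-wise sum in (8) written out literally with np.einsum:

import numpy as np

rng = np.random.default_rng(1)
N, D, M = 4, 3, 5
W = rng.standard_normal((D, M))
dZ = rng.standard_normal((N, M))           # stands in for dL/dZ

dX_matrix = dZ @ W.T                       # eq. (9)
dX_sum = np.einsum('nm,dm->nd', dZ, W)     # eq. (8): sum over m of dZ[n,m] * W[d,m]

print(np.allclose(dX_matrix, dX_sum))      # expect True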
$$\begin{aligned}\frac{\partial{L}}{\partial{W_{d,m}}}&=\sum_n{\frac{\partial{L}}{\partial{Z_{n,m}}}\frac{\partial{Z_{n,m}}}{\partial{W_{d,m}}}}\\&=\sum_n{\frac{\partial{L}}{\partial{Z_{n,m}}}X_{n,d}}\end{aligned}\tag{10}$$

That is,

$$\frac{\partial{L}}{\partial{W}}=X^T\frac{\partial{L}}{\partial{Z}}\tag{11}$$
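The same kind of spot check (again just a sketch with arbitrary shapes) confirms that (11) matches the element-wise sum in (10):

import numpy as np

rng = np.random.default_rng(2)
N, D, M = 4, 3, 5
X = rng.standard_normal((N, D))
dZ = rng.standard_normal((N, M))           # stands in for dL/dZ

dW_matrix = X.T @ dZ                       # eq. (11)
dW_sum = np.einsum('nm,nd->dm', dZ, X)     # eq. (10): sum over n of dZ[n,m] * X[n,d]

print(np.allclose(dW_matrix, dW_sum))      # expect True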
def affine_backward(dout, cache):
    """Computes the backward pass for an affine (fully connected) layer.

    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ..., d_k)
      - w: Weights, of shape (D, M)
      - b: Biases, of shape (M,)

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d_1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """
    x, w, b = cache
    dx, dw, db = None, None, None
    N = x.shape[0]
    x_ = x.reshape(N, -1)                   # (N, d_1, ..., d_k) => (N, D)
    db = np.sum(dout, axis=0)               # eq. (4)
    dw = x_.T @ dout                        # eq. (11)
    dx = (dout @ w.T).reshape(x.shape)      # eq. (9), restored to the input shape
    return dx, dw, db
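A quick shape check (illustrative only, reusing the made-up shapes from the forward example above): dx comes back with the same shape as the original multi-dimensional input, while dw and db match the parameter shapes.

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 3, 4))   # (N, d_1, d_2)
w = rng.standard_normal((12, 5))     # (D, M)
b = rng.standard_normal(5)           # (M,)

out, cache = affine_forward(x, w, b)
dout = rng.standard_normal(out.shape)      # pretend upstream gradient
dx, dw, db = affine_backward(dout, cache)

print(dx.shape, dw.shape, db.shape)        # (2, 3, 4) (12, 5) (5,)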
2. ReLU
Forward pass
Inputs:
- X: (N, D)
Returns:
- Y: (N, D)
$$Y=\max{(0,X)}\tag{12}$$
def relu_forward(x):
    out = None
    out = np.maximum(0, x)
    cache = x
    return out, cache
Backward pass
$$\begin{aligned}\frac{\partial{L}}{\partial{X_{n,d}}}&=\frac{\partial{L}}{\partial{Y_{n,d}}}\frac{\partial{Y_{n,d}}}{\partial{X_{n,d}}}\\&=\frac{\partial{L}}{\partial{Y_{n,d}}}\mathbf{1}\{X_{n,d}>0\}\end{aligned}\tag{13}$$
def relu_backward(dout, cache):
    """Computes the backward pass for a layer of rectified linear units (ReLUs).

    Input:
    - dout: Upstream derivatives, of any shape
    - cache: Input x, of same shape as dout

    Returns:
    - dx: Gradient with respect to x
    """
    dx, x = None, cache
    dx = (x > 0) * dout
    return dx
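A tiny illustration (not from the assignment; the numbers are arbitrary) of the indicator in eq. (13): the upstream gradient is passed through only where the input was strictly positive and is zeroed elsewhere.

import numpy as np

x = np.array([[-1.0, 2.0],
              [ 0.0, 3.0]])
dout = np.array([[10.0, 10.0],
                 [10.0, 10.0]])

dx = relu_backward(dout, x)   # here the cache is simply x
print(dx)
# [[ 0. 10.]
#  [ 0. 10.]]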