残差神经网络(ResNet)
为什么神经网络的层数越来越深
由图可知随着神经网络层数的增加,神经网络的训练误差和测试误差相较于较低层数的神经网络都要更高。但是这个与越深的网络,训练误差和测试误差都要更小这个第一感觉相违背。
在相同的迭代次数下,更深的神经网络的误差更容易趋向于平稳。
神经网络随着层数增加误差率反而上升
这是因为数据在经过常规的神经网络层后,经过链式法则梯度相乘后,会发生梯度消失,神经网络的收敛速度就会下降。
Plain Net
数据直接经过权重层和非线性变换得到输出\(H(x)\)
Residual Net
将输入直接加到权重层的输出上,输出就等于经过变换的\(F(x)\)与原始输入\(x\)之和,即\(H(x)=F(x)+x\)
这里以MNIST数据集来构建一个残差神经网络模型
残差神经网络块(Residual Block)
class ResidualBlock(nn.Module):
    """Residual block that computes relu(x + conv2(relu(conv1(x)))).

    Both convolutions use a 3x3 kernel with padding=1 and keep the channel
    count unchanged, so the result has the same shape as the input and can
    be added to it element-wise.
    """

    def __init__(self, channels):
        """Build the two convolution layers.

        Args:
            channels: number of input channels; the output has the same count.
        """
        # Fixed: the original passed the misspelled name ``ResidualBolck`` to
        # super(), which raises NameError as soon as the block is constructed.
        super(ResidualBlock, self).__init__()
        self.channels = channels
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        """Apply both convolutions to ``x`` and add the skip connection."""
        y = F.relu(self.conv1(x))
        # Fixed: conv2 must consume y (the original fed it x, discarding the
        # conv1 result), and the activation is applied only after the skip
        # connection has been added.
        y = self.conv2(y)
        return F.relu(x + y)
# 创建两个通道数不变的卷积层,通过填充padding使得图像大小不变,然后将经过两个卷积层变换后的y加上原始输入x,最后返回relu(x+y)
# 步长stride=1时,卷积后图像的边长减少kernel_size-1,因此两侧各填充padding=(kernel_size-1)/2就能保持图像大小不变(例如kernel_size=3时padding=1)
整个神经网络代码
class Net(nn.Module):
    """Small CNN with two residual blocks and a 10-way linear classifier.

    The shape comments in ``forward`` assume 1x28x28 (MNIST) inputs.
    """

    def __init__(self):
        """Create the convolution, pooling, residual and linear layers."""
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5)   # convolution layer 1
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5)  # convolution layer 2
        self.mp = nn.MaxPool2d(2)                      # max-pooling layer
        self.rblock1 = ResidualBlock(16)               # residual block 1
        self.rblock2 = ResidualBlock(32)               # residual block 2
        self.fc = nn.Linear(512, 10)                   # fully connected layer

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: input batch laid out as (batch_size, channel, height, width).

        Returns:
            The output of the final linear layer, one row of 10 scores per
            sample.
        """
        # Keep the batch size so the feature maps can be flattened per sample.
        in_size = x.size(0)
        x = self.mp(F.relu(self.conv1(x)))  # (1,28,28)->(16,24,24)->(16,12,12)
        x = self.rblock1(x)                 # (16,12,12)->(16,12,12)
        x = self.mp(F.relu(self.conv2(x)))  # (16,12,12)->(32,8,8)->(32,4,4)
        # Fixed: the original called the misspelled ``self.rbolck2``, which
        # raises AttributeError on the first forward pass.
        x = self.rblock2(x)                 # (32,4,4)->(32,4,4)
        x = x.view(in_size, -1)             # flatten: 32*4*4 = 512 features
        x = self.fc(x)
        return x
完整代码:
import torch
import torch.nn as nn
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader
import torch.nn.functional as F
# Mini-batch size shared by the training and the test loader.
batch_size = 64

# Convert each image to a tensor, then normalise it with mean 0.1307 and
# standard deviation 0.3081.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

# MNIST training and test splits stored under ./dataset/mnist/.
train_dataset = datasets.MNIST(
    root='./dataset/mnist/', train=True, download=True, transform=transform
)
test_dataset = datasets.MNIST(
    root='./dataset/mnist/', train=False, download=True, transform=transform
)

# Only the training data is shuffled; the test data keeps its order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
class ResidualBlock(nn.Module):
    """Shape-preserving residual block: relu(x + conv2(relu(conv1(x)))).

    Each convolution is 3x3 with padding=1 and maps ``channels`` channels to
    ``channels`` channels, so the skip connection can be added directly.
    """

    def __init__(self, channels):
        """Create the two convolution layers for ``channels`` feature maps."""
        super().__init__()
        self.channels = channels
        # Both layers share the same geometry; conv1 is created first so the
        # parameter order matches the layer order.
        conv_kwargs = dict(kernel_size=3, padding=1)
        self.conv1 = nn.Conv2d(channels, channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(channels, channels, **conv_kwargs)

    def forward(self, x):
        """Return the activated sum of ``x`` and its convolved residual."""
        residual = self.conv2(F.relu(self.conv1(x)))
        return F.relu(x + residual)
class Net(nn.Module):
    """Two conv/pool stages, each followed by a residual block, then a
    linear layer producing 10 outputs."""

    def __init__(self):
        """Register the layers in the order they are applied."""
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5)
        self.mp = nn.MaxPool2d(2)
        self.rblock1 = ResidualBlock(16)
        self.rblock2 = ResidualBlock(32)
        self.fc = nn.Linear(512, 10)

    def forward(self, x):
        """Map an input batch to one row of 10 scores per sample."""
        batch = x.size(0)

        # Stage 1: convolution -> ReLU -> max-pool -> residual block.
        stage1 = self.conv1(x)
        stage1 = self.mp(F.relu(stage1))
        stage1 = self.rblock1(stage1)

        # Stage 2: same pattern with the wider convolution.
        stage2 = self.conv2(stage1)
        stage2 = self.mp(F.relu(stage2))
        stage2 = self.rblock2(stage2)

        # Flatten every sample to a vector before the linear classifier.
        flat = stage2.view(batch, -1)
        return self.fc(flat)
# Use the first CUDA GPU when one is available; otherwise fall back to the
# CPU so the script still runs on machines without CUDA (the original
# hard-coded 'cuda:0' and failed there when moving the model).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=.5)
# Despite its name, test() appends the test-set accuracy (in percent) to
# this list, not a loss value.
loss_l = []
def train(epoch):
    """Run one pass over ``train_loader`` and update ``model``.

    Prints the mean loss of every 300 consecutive mini-batches.

    Args:
        epoch: zero-based epoch index; only used in the progress message.
    """
    running_loss = 0.0
    for batch_idx, (inputs, target) in enumerate(train_loader):
        inputs, target = inputs.to(device), target.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        # Fixed: accumulate a plain Python float. Adding the loss tensor
        # itself keeps autograd history attached to the running total.
        running_loss += loss.item()
        if batch_idx % 300 == 299:
            print('[{},{}] loss:{:.3f}'.format(epoch + 1, batch_idx + 1, running_loss / 300))
            running_loss = 0.0
def test():
    """Evaluate ``model`` on ``test_loader``.

    Appends the accuracy (in percent) to ``loss_l`` and prints it with the
    fractional part dropped by the ``%d`` format.
    """
    num_correct = 0
    num_seen = 0
    # No gradients are needed while evaluating.
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            # torch.max over dim=1 returns (values, indices); the indices are
            # the predicted classes.
            prediction = torch.max(outputs, dim=1)[1]
            num_seen += labels.size(0)
            num_correct += (prediction == labels).sum().item()
    accuracy = 100 * num_correct / num_seen
    loss_l.append(accuracy)
    print('Accuracy on test set %d %%' % accuracy)
# Script entry point: train for three epochs.
# NOTE(review): indentation was lost in this listing; test() is assumed to
# run inside the loop, once after every epoch. Confirm against the original
# script.
if __name__ == '__main__':
    for epoch in range(3):
        train(epoch)
        test()
运行结果:
准确率达到了98.64%
模型参数:
Net(
(conv1): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1))
(conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
(mp): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(rblock1): ResidualBlock(
(conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(rblock2): ResidualBlock(
(conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(fc): Linear(in_features=512, out_features=10, bias=True)
)