Building AlexNet with PyTorch
Preface
AlexNet was the first deep convolutional neural network to achieve breakthrough accuracy on large-scale image recognition: compared with earlier networks, it uses more convolutional layers and a far larger parameter space to fit the ImageNet dataset. Let's learn how to build it.
The AlexNet architecture is shown in the figure below:
[Figure: AlexNet network architecture]
As the figure shows, AlexNet has 8 parameter layers: 5 convolutional layers and 3 fully connected layers. Note that, because of the limited GPU memory of the time, AlexNet was split across 2 GPUs, each holding half of the model parameters; today a single GPU is more than enough for the whole model.
AlexNet's design philosophy is very similar to LeNet's, but there are notable differences:
First, AlexNet is a much larger model, with 8 parameter layers: 5 convolutional layers and 3 fully connected layers. The hyperparameters of each layer are clearly labeled in the figure:
The first convolutional layer uses an 11×11 window with stride 4 and padding 2 and has 96 output channels; its output feature map has shape (96, 55, 55);
The second convolutional layer uses a 5×5 window with stride 1 and padding 2 and has 256 output channels; its output feature map has shape (256, 27, 27);
The third through fifth convolutional layers use 3×3 windows with stride 1 and padding 1; their output feature maps have shapes (384, 13, 13), (384, 13, 13), and (256, 13, 13) respectively.
After the first, second, and fifth convolutional layers, a max-pooling layer with a 3×3 window and stride 2 is applied (these shapes are verified in a short snippet after this list).
Second, AlexNet replaced the sigmoid activation function with the simpler ReLU activation function.
Third, AlexNet controls the model complexity of the fully connected layers with Dropout, which reduces overfitting.
Fourth, AlexNet introduced extensive image augmentation, such as flipping, cropping, and color jittering, enlarging the effective dataset to further mitigate overfitting.
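To double-check the feature-map sizes listed above, apply the standard output-size formula floor((H + 2·padding − kernel) / stride) + 1 to each stage. A minimal sketch:

def out_size(h, k, s, p):
    # floor((h + 2*p - k) / s) + 1
    return (h + 2 * p - k) // s + 1

h = 224
h = out_size(h, 11, 4, 2)      # conv1 -> 55
h = out_size(h, 3, 2, 0)       # pool1 -> 27
h = out_size(h, 5, 1, 2)       # conv2 -> 27
h = out_size(h, 3, 2, 0)       # pool2 -> 13
for _ in range(3):
    h = out_size(h, 3, 1, 1)   # conv3-conv5 -> 13
h = out_size(h, 3, 2, 0)       # pool3 -> 6
print(h)                       # 6, i.e. the conv stack ends at 256×6×6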
Below is a slightly simplified implementation of AlexNet:
import torch
from torch import nn

class AlexNet(nn.Module):
    def __init__(self, num_channels=3, num_classes=1000):
        super(AlexNet, self).__init__()
        # Convolutional part: 5 conv layers interleaved with 3 max-pooling layers
        self.conv = nn.Sequential(
            nn.Conv2d(num_channels, 96, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2),
            nn.Conv2d(96, 256, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2),
            nn.Conv2d(256, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2)
        )
        # Fully connected part: 3 linear layers, with Dropout for regularization
        self.fc = nn.Sequential(
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        feature = self.conv(x)
        feature = feature.view(x.shape[0], -1)  # flatten to (batch, 256*6*6)
        return self.fc(feature)
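A quick way to confirm the 256 * 6 * 6 input size of the first fully connected layer is to pass a dummy batch through the network (a minimal sanity check):

net = AlexNet(num_classes=2)
x = torch.randn(1, 3, 224, 224)   # dummy batch: one 224×224 RGB image
print(net.conv(x).shape)          # torch.Size([1, 256, 6, 6])
print(net(x).shape)               # torch.Size([1, 2])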
Print the network structure with torchsummary:
if __name__ == "__main__":
    net = AlexNet(num_classes=2)
    from torchsummary import summary
    net.cuda()
    summary(net, (3, 224, 224))
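If no GPU is available, skip net.cuda() and (depending on the installed torchsummary version) pass the device explicitly:

summary(net, (3, 224, 224), device="cpu")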
Output:
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 96, 55, 55] 34,944
ReLU-2 [-1, 96, 55, 55] 0
MaxPool2d-3 [-1, 96, 27, 27] 0
Conv2d-4 [-1, 256, 27, 27] 614,656
ReLU-5 [-1, 256, 27, 27] 0
MaxPool2d-6 [-1, 256, 13, 13] 0
Conv2d-7 [-1, 384, 13, 13] 885,120
ReLU-8 [-1, 384, 13, 13] 0
Conv2d-9 [-1, 384, 13, 13] 1,327,488
ReLU-10 [-1, 384, 13, 13] 0
Conv2d-11 [-1, 256, 13, 13] 884,992
ReLU-12 [-1, 256, 13, 13] 0
MaxPool2d-13 [-1, 256, 6, 6] 0
Linear-14 [-1, 4096] 37,752,832
ReLU-15 [-1, 4096] 0
Dropout-16 [-1, 4096] 0
Linear-17 [-1, 4096] 16,781,312
ReLU-18 [-1, 4096] 0
Dropout-19 [-1, 4096] 0
Linear-20 [-1, 2] 8,194
================================================================
Total params: 58,289,538
Trainable params: 58,289,538
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 11.08
Params size (MB): 222.36
Estimated Total Size (MB): 234.01
----------------------------------------------------------------
This is consistent with the network architecture diagram.
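As a cross-check, each Param # in the table can be computed by hand: out_channels × (in_channels × kernel_height × kernel_width) + out_channels biases for a convolutional layer, and in_features × out_features + out_features biases for a linear layer. For example:

96 * (3 * 11 * 11) + 96       # Conv2d-1:  34,944
256 * (96 * 5 * 5) + 256      # Conv2d-4:  614,656
256 * 6 * 6 * 4096 + 4096     # Linear-14: 37,752,832
4096 * 2 + 2                  # Linear-20: 8,194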
Loading the Data and Training the Model
Let's train an AlexNet model from scratch to classify the hotdog dataset:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dir = "../data/hotdog/train"
test_dir = "../data/hotdog/test"

# Resize images to 224×224 and normalize with the ImageNet statistics
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
train_augs = transforms.Compose([
    transforms.RandomResizedCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
test_augs = transforms.Compose([
    transforms.Resize(size=256),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

train_set = datasets.ImageFolder(train_dir, transform=train_augs)
test_set = datasets.ImageFolder(test_dir, transform=test_augs)
batch_size = 32
train_iter = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_iter = DataLoader(test_set, batch_size=batch_size)
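Before training, it is worth verifying that ImageFolder picked up both classes and that batches have the expected shape (a quick sanity check; ImageFolder assumes one subdirectory per class):

print(train_set.classes)    # e.g. ['hotdog', 'not-hotdog']
X, y = next(iter(train_iter))
print(X.shape, y.shape)     # torch.Size([32, 3, 224, 224]) torch.Size([32])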
def train(net, train_iter, test_iter, criterion, optimizer, num_epochs):
    net = net.to(device)
    print("training on", device)
    for epoch in range(num_epochs):
        start = time.time()
        net.train()  # training mode (enables Dropout)
        train_loss_sum, train_acc_sum, n, batch_count = 0.0, 0.0, 0, 0
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()  # clear the gradients
            y_hat = net(X)
            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()
            train_loss_sum += loss.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        with torch.no_grad():
            net.eval()  # evaluation mode (disables Dropout)
            test_acc_sum, n2 = 0.0, 0
            for X, y in test_iter:
                test_acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                n2 += y.shape[0]
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_loss_sum / batch_count, train_acc_sum / n, test_acc_sum / n2, time.time() - start))
from alexnet import AlexNet
net = AlexNet(num_classes=2)
optimizer = optim.Adam(net.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()
train(net, train_iter, test_iter, loss, optimizer, num_epochs=5)
Training log:
training on cuda
epoch 1, loss 0.7605, train acc 0.533, test acc 0.541, time 16.6 sec
epoch 2, loss 0.6489, train acc 0.625, test acc 0.709, time 16.6 sec
epoch 3, loss 0.5020, train acc 0.781, test acc 0.776, time 17.5 sec
epoch 4, loss 0.4634, train acc 0.785, test acc 0.829, time 17.4 sec
epoch 5, loss 0.4773, train acc 0.778, test acc 0.809, time 17.1 sec
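After training, you may want to persist the learned weights for later inference (a minimal sketch; the filename is arbitrary):

torch.save(net.state_dict(), "alexnet_hotdog.pt")   # save the learned parameters
# Later, restore them into a freshly constructed model:
net = AlexNet(num_classes=2)
net.load_state_dict(torch.load("alexnet_hotdog.pt", map_location=device))
net.eval()   # switch to evaluation mode for inference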