4.2 Implementation from Scratch
import torch
import torchvision
from d2l import torch as d2l

def get_data(batch_size=50):
    """Download Fashion-MNIST and return train/test DataLoaders."""
    trans = torchvision.transforms.ToTensor()
    mnist_train = torchvision.datasets.FashionMNIST(root="../data", train=True,
                                                    transform=trans, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(root="../data", train=False,
                                                   transform=trans, download=True)
    train = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True)
    test = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False)
    return train, test
train_iter, test_iter = get_data()
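A quick shape check on one batch, assuming the default batch_size of 50: Fashion-MNIST images are 1×28×28 grayscale tensors.

X, y = next(iter(train_iter))
print(X.shape, y.shape)  # torch.Size([50, 1, 28, 28]) torch.Size([50])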
net
def my_relu(X):
    # Element-wise max(X, 0)
    a = torch.zeros_like(X)
    return torch.max(X, a)
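A quick check that my_relu behaves like the standard ReLU, zeroing out negative entries element-wise:

print(my_relu(torch.tensor([-1.0, 0.0, 2.0])))  # tensor([0., 0., 2.])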
in_num = 784   # 28*28 input pixels
hid_num = 256  # hidden units
out_num = 10   # output classes

# Weights start from a small-scale normal distribution; biases start at zero.
W1 = torch.nn.Parameter(torch.randn(in_num, hid_num, requires_grad=True) * 0.01)
b1 = torch.nn.Parameter(torch.zeros(hid_num, requires_grad=True))
W2 = torch.nn.Parameter(torch.randn(hid_num, out_num, requires_grad=True) * 0.01)
b2 = torch.nn.Parameter(torch.zeros(out_num, requires_grad=True))
params = [W1, b1, W2, b2]
def mlp(X):
    X = X.reshape((-1, in_num))             # flatten images to (batch, 784)
    H = my_relu(torch.matmul(X, W1) + b1)   # hidden layer
    return torch.matmul(H, W2) + b2         # output logits
net = mlp
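A sanity-check forward pass, a minimal sketch using one batch from train_iter: each image is flattened to 784 features and the network emits one logit per class.

X, y = next(iter(train_iter))
print(net(X).shape)  # torch.Size([50, 10]): one row of 10 logits per image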
loss
loss = torch.nn.CrossEntropyLoss()
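Note that torch.nn.CrossEntropyLoss combines log-softmax and negative log-likelihood internally, which is why mlp returns raw logits rather than probabilities. A quick illustration with made-up values:

logits = torch.randn(3, 10)       # 3 examples, 10 classes, unnormalized scores
labels = torch.tensor([0, 4, 9])  # integer class indices, not one-hot
print(loss(logits, labels))       # a scalar: the mean loss over the batch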
optim
op = torch.optim.SGD(params, lr=0.01)
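For reference, op.step() for plain SGD (no momentum) is equivalent to the in-place update p ← p − lr·p.grad; a hand-rolled sketch of the same step:

def sgd_step(params, lr):
    # What op.step() does for vanilla SGD: in-place gradient descent.
    with torch.no_grad():
        for p in params:
            p -= lr * p.grad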
train
def accuracy(y_pred, y):
    """Count the number of correct predictions in a batch."""
    if len(y_pred.shape) > 1 and y_pred.shape[1] > 1:
        y_pred = torch.argmax(y_pred, dim=1)  # pick the highest-scoring class
    cmp = y_pred.type(y.dtype) == y
    return float(cmp.type(y.dtype).sum())
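For example, with two predictions of which only the first is correct, accuracy returns 1.0; it returns a count of correct predictions, and the caller divides by the total:

y_pred = torch.tensor([[0.1, 0.9], [0.8, 0.2]])  # argmax: class 1, class 0
y = torch.tensor([1, 1])
print(accuracy(y_pred, y))  # 1.0: only the first prediction matches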
def evaluate_accuracy(test_iter, net):
    """Compute accuracy over a whole dataset."""
    if isinstance(net, torch.nn.Module):  # type(net) == Module misses subclasses
        net.eval()  # switch to evaluation mode
    metric = d2l.Accumulator(2)  # (correct predictions, total examples)
    with torch.no_grad():  # no gradients needed for evaluation
        for X, y in test_iter:
            metric.add(accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]
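Before training, a 10-class classifier with random weights should sit near chance level, which makes this a handy sanity check:

print(evaluate_accuracy(test_iter, net))  # expect roughly 0.1 before training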
def train_epoch(train_iter, net, loss, op):
    """Train for one epoch; return (average loss, average accuracy)."""
    if isinstance(net, torch.nn.Module):
        net.train()  # switch to training mode
    metric = d2l.Accumulator(3)  # (examples, summed loss, correct predictions)
    for X, y in train_iter:
        y_pred = net(X)
        l = loss(y_pred, y)  # CrossEntropyLoss returns the batch mean
        op.zero_grad()
        l.backward()
        op.step()
        # Multiply the mean loss back by the batch size to accumulate a sum.
        metric.add(y.numel(), float(l) * y.numel(), accuracy(y_pred, y))
    return metric[1] / metric[0], metric[2] / metric[0]
def train(epoch, train_iter, test_iter, net, loss, op):
    """Train for the given number of epochs, plotting loss and accuracies."""
    animator = d2l.Animator(xlabel='epoch', xlim=[1, epoch], ylim=[0.3, 0.9],
                            legend=['train loss', 'train acc', 'test acc'])
    for i in range(epoch):
        train_metric = train_epoch(train_iter, net, loss, op)
        acc = evaluate_accuracy(test_iter, net)
        animator.add(i + 1, train_metric + (acc,))
epoch = 10
train(epoch, train_iter, test_iter, net, loss, op)
4.2.7 Exercises
1. With all other parameters held constant, change the value of the hyperparameter num_hiddens and see how it affects the results. Determine the best value of this hyperparameter.
2. Try adding more hidden layers and see how this affects the results.
3. How does changing the learning rate affect the results? Keeping the model architecture and the other hyperparameters (including the number of epochs) fixed, which learning rate gives the best results?
4. What is the best result you can get by jointly optimizing over all the hyperparameters (learning rate, number of epochs, number of hidden layers, number of hidden units per layer)?
5. Describe why dealing with multiple hyperparameters is much more challenging.
The combinations of multiple hyperparameters multiply with each one added, so there are far too many cases to try exhaustively.
6. If you had to design a search strategy over multiple hyperparameters, what is the smartest strategy you can think of?
Grid search? (A sketch follows below.)
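A minimal grid-search sketch for exercise 6, reusing this section's train_epoch and evaluate_accuracy; the value grids and the build_model helper below are illustrative assumptions, not tuned choices. Strictly speaking, model selection should use a held-out validation split rather than the test set, and random search often spends the same budget more efficiently than a grid.

import itertools

def build_model(hid_num):
    # Hypothetical helper: a fresh one-hidden-layer MLP for each trial.
    return torch.nn.Sequential(torch.nn.Flatten(),
                               torch.nn.Linear(784, hid_num), torch.nn.ReLU(),
                               torch.nn.Linear(hid_num, 10))

best_cfg, best_acc = None, 0.0
for lr, hid_num in itertools.product([0.01, 0.1, 0.5], [64, 256, 1024]):
    candidate = build_model(hid_num)
    op = torch.optim.SGD(candidate.parameters(), lr=lr)
    for _ in range(10):  # fixed epoch budget per configuration
        train_epoch(train_iter, candidate, loss, op)
    acc = evaluate_accuracy(test_iter, candidate)
    if acc > best_acc:
        best_cfg, best_acc = (lr, hid_num), acc
print('best (lr, hid_num):', best_cfg, 'test acc:', best_acc)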
4.3 Concise Implementation
from d2l import torch as d2l
net
net = torch.nn.Sequential(torch.nn.Flatten(),
                          torch.nn.Linear(784, 512), torch.nn.ReLU(),
                          torch.nn.Linear(512, 256), torch.nn.ReLU(),
                          torch.nn.Linear(256, 10))
def para_init(m):
    # Initialize every Linear layer's weights from N(0, 0.01^2).
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.normal_(m.weight, std=0.01)
net.apply(para_init)  # applied recursively to every submodule
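A quick check that the initialization took effect, assuming net[1] is the first Linear layer: its weights should have a standard deviation near 0.01 (note para_init leaves the biases at PyTorch's default init).

print(float(net[1].weight.std()))  # expect a value close to 0.01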
loss
loss = torch.nn.CrossEntropyLoss()
optim
op = torch.optim.SGD(net.parameters(), lr=0.01)
train
The training utilities — accuracy, evaluate_accuracy, train_epoch, and train — are identical to those defined in Section 4.2 and are reused here unchanged.
epoch = 10
train(epoch, train_iter, test_iter, net, loss, op)
4.3.3 Exercises
1. Try adding different numbers of hidden layers (you may also modify the learning rate). Which configuration works best?
2. Try different activation functions. Which one works best?
3. Try different schemes for initializing the weights. Which method works best? (A sketch follows below.)
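For exercise 3, a hedged sketch of one alternative scheme; xavier_init is a hypothetical name, and whether it beats the normal init above should be checked empirically by retraining.

def xavier_init(m):
    # Alternative to para_init: Xavier/Glorot uniform initialization.
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        torch.nn.init.zeros_(m.bias)

net.apply(xavier_init)  # re-initialize, then retrain and compare test accuracy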