Overview

I recently heard from a friend that Huawei's deep learning framework MindSpore has been updated to version 1.0.0, with much better efficiency than before. I went to the MindSpore website to have a look, and the visualization tutorial notebook there caught my interest. So I decided to use that notebook as a starting point and learn how to record data during the training process.

Process

First, download the dataset. Here is the code:

```python
import os, shutil
import urllib.request
from urllib.parse import urlparse

def callbackfunc(blocknum, blocksize, totalsize):
    # Report download progress.
    percent = 100.0 * blocknum * blocksize / totalsize
    if percent > 100:
        percent = 100
    print("downloaded {:.1f}".format(percent), end="\r")

def _download_dataset():
    ds_url = "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz"
    file_base_name = urlparse(ds_url).path.split("/")[-1]
    file_name = os.path.join("./datasets", file_base_name)
    if not os.path.exists(file_name):
        urllib.request.urlretrieve(ds_url, file_name, callbackfunc)
    print("{:*^40}".format("DataSets Downloaded"))
    shutil.unpack_archive(file_name, extract_dir="./datasets/cifar-10-binary")

def _copy_dataset(ds_part, dest_path):
    data_source_path = "./datasets/cifar-10-binary/cifar-10-batches-bin"
    ds_part_source_path = os.path.join(data_source_path, ds_part)
    if not os.path.exists(ds_part_source_path):
        _download_dataset()
    shutil.copy(ds_part_source_path, dest_path)

def download_cifar10_dataset():
    ds_base_path = "./datasets/cifar-10-batches-bin"
    train_path = os.path.join(ds_base_path, "train")
    test_path = os.path.join(ds_base_path, "test")
    print("{:*^40}".format("Checking DataSets Path."))
    if not os.path.exists(train_path) and not os.path.exists(test_path):
        os.makedirs(train_path)
        os.makedirs(test_path)
    print("{:*^40}".format("Downloading CIFAR-10 DataSets."))
    for i in range(1, 6):
        train_part = "data_batch_{}.bin".format(i)
        if not os.path.exists(os.path.join(train_path, train_part)):
            _copy_dataset(train_part, train_path)
        pops = train_part + " is ok"
        print("{:*^40}".format(pops))
    test_part = "test_batch.bin"
    if not os.path.exists(os.path.join(test_path, test_part)):
        _copy_dataset(test_part, test_path)
    print("{:*^40}".format(test_part + " is ok"))
    print("{:*^40}".format("Downloaded CIFAR-10 DataSets Already."))

download_cifar10_dataset()
```
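Just to convince myself that everything ended up where expected, a quick check of the extracted files (my own addition, assuming the default paths used above):

```python
import os

base = "./datasets/cifar-10-batches-bin"
for sub in ("train", "test"):
    # The train folder should hold data_batch_1.bin .. data_batch_5.bin, test holds test_batch.bin.
    print(sub, sorted(os.listdir(os.path.join(base, sub))))
```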
So this part uses the urllib module to download the CIFAR-10 dataset into a local directory.

Processing the dataset

Once downloaded, the dataset cannot be fed to training directly; it needs some preprocessing first. After digging into it, these are the main operations used:

```python
import mindspore.dataset.transforms.c_transforms as C
import mindspore.dataset.vision.c_transforms as CV
import mindspore.common.dtype as mstype  # needed for the TypeCast below

resize_op = CV.Resize(size=(227, 227))
# rescale, shift, normalize_op and cifar_ds come from earlier cells of the original notebook (not shown here)
rescale_op = CV.Rescale(rescale, shift)
channel_swap_op = CV.HWC2CHW()
typecast_op = C.TypeCast(mstype.int32)

cifar_ds = cifar_ds.map(operations=resize_op, input_columns="image")
cifar_ds = cifar_ds.map(operations=rescale_op, input_columns="image")
cifar_ds = cifar_ds.map(operations=normalize_op, input_columns="image")
cifar_ds = cifar_ds.map(operations=channel_swap_op, input_columns="image")
```
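Since those earlier cells aren't shown, here is a minimal sketch of how the full ds_train pipeline could be put together. The data path, the batch size of 32 (matching the 32 images displayed in the next step), the rescale/shift values, and the normalization statistics are my own assumptions, not necessarily what the original notebook used:

```python
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
import mindspore.dataset.vision.c_transforms as CV
import mindspore.common.dtype as mstype

def create_cifar10_dataset(data_path="./datasets/cifar-10-batches-bin/train", batch_size=32):
    cifar_ds = ds.Cifar10Dataset(data_path, shuffle=True)

    # Image pipeline: resize for AlexNet, scale to [0, 1], normalize, then HWC -> CHW.
    rescale, shift = 1.0 / 255.0, 0.0                       # assumed values
    normalize_op = CV.Normalize((0.4914, 0.4822, 0.4465),   # commonly used CIFAR-10 statistics
                                (0.2023, 0.1994, 0.2010))
    image_ops = [CV.Resize((227, 227)), CV.Rescale(rescale, shift),
                 normalize_op, CV.HWC2CHW()]
    cifar_ds = cifar_ds.map(operations=image_ops, input_columns="image")

    # Label pipeline: the typecast_op defined above, applied to the label column.
    cifar_ds = cifar_ds.map(operations=C.TypeCast(mstype.int32), input_columns="label")

    return cifar_ds.batch(batch_size, drop_remainder=True)

ds_train = create_cifar10_dataset()
```

Pointing data_path at the test folder instead would give the ds_eval dataset used later during evaluation.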
Use pyplot to take a look at the preprocessed images:

```python
from matplotlib import pyplot as plt
import numpy as np

label_list = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]
print("The 32 images with label of the first batch in ds_train are showed below:")
ds_iterator = ds_train.create_dict_iterator()
ds_iterator.get_next()
batch_1 = ds_iterator.get_next()
batch_image = batch_1["image"].asnumpy()
batch_label = batch_1["label"].asnumpy()
%matplotlib inline
plt.figure(dpi=144)
for i, image in enumerate(batch_image):
    plt.subplot(4, 8, i + 1)
    plt.subplots_adjust(wspace=0.2, hspace=0.2)
    image = image / np.amax(image)
    image = np.clip(image, 0, 1)
    image = np.transpose(image, (1, 2, 0))
    plt.imshow(image)
    num = batch_label[i]
    plt.title(f"image {i+1}\n{label_list[num]}", y=-0.65, fontdict={"fontsize": 8})
    plt.axis('off')
plt.show()
```
So exciting: the images of the first batch are displayed.

Defining the network

Now we finally get to defining the network. Once again I bring out the ultimate move, Ctrl + C / Ctrl + V, done in one go:

```python
import mindspore.nn as nn
from mindspore.common.initializer import TruncatedNormal
from mindspore.ops import operations as P

def conv(in_channels, out_channels, kernel_size, stride=1, padding=0, pad_mode="valid"):
    weight = weight_variable()
    return nn.Conv2d(in_channels, out_channels,
                     kernel_size=kernel_size, stride=stride, padding=padding,
                     weight_init=weight, has_bias=False, pad_mode=pad_mode)

def fc_with_initialize(input_channels, out_channels):
    weight = weight_variable()
    bias = weight_variable()
    return nn.Dense(input_channels, out_channels, weight, bias)

def weight_variable():
    return TruncatedNormal(0.02)

class AlexNet(nn.Cell):
    """
    Alexnet
    """
    def __init__(self, num_classes=10, channel=3):
        super(AlexNet, self).__init__()
        self.conv1 = conv(channel, 96, 11, stride=4)
        self.conv2 = conv(96, 256, 5, pad_mode="same")
        self.conv3 = conv(256, 384, 3, pad_mode="same")
        self.conv4 = conv(384, 384, 3, pad_mode="same")
        self.conv5 = conv(384, 256, 3, pad_mode="same")
        self.relu = nn.ReLU()
        self.max_pool2d = P.MaxPool(ksize=3, strides=2)
        self.flatten = nn.Flatten()
        self.fc1 = fc_with_initialize(6*6*256, 4096)
        self.fc2 = fc_with_initialize(4096, 4096)
        self.fc3 = fc_with_initialize(4096, num_classes)
        # Init TensorSummary
        self.tensor_summary = P.TensorSummary()
        # Init ImageSummary
        self.image_summary = P.ImageSummary()

    def construct(self, x):
        # Record image by Summary operator
        self.image_summary("Image", x)
        x = self.conv1(x)
        # Record tensor by Summary operator
        self.tensor_summary("Tensor", x)
        x = self.relu(x)
        x = self.max_pool2d(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.max_pool2d(x)
        x = self.conv3(x)
        x = self.relu(x)
        x = self.conv4(x)
        x = self.relu(x)
        x = self.conv5(x)
        x = self.relu(x)
        x = self.max_pool2d(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x
```
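A small aside that helped me: why does fc1 expect 6*6*256 inputs when the images were resized to 227x227? A quick back-of-the-envelope check (my own addition, not part of the tutorial):

```python
def out_size(size, kernel, stride):
    """Output size of a 'valid' convolution or pooling layer."""
    return (size - kernel) // stride + 1

s = 227
s = out_size(s, 11, 4)  # conv1, pad_mode="valid", stride 4 -> 55
s = out_size(s, 3, 2)   # max_pool2d -> 27
# conv2 uses pad_mode="same", so the spatial size stays 27
s = out_size(s, 3, 2)   # max_pool2d -> 13
# conv3, conv4, conv5 are all "same", so still 13
s = out_size(s, 3, 2)   # final max_pool2d -> 6
print(s * s * 256)      # 9216 == 6*6*256, the fc1 input size
```

So the three pooling layers take 227 down to 6, and flattening gives the 6*6*256 features that feed the first dense layer.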
Now I roughly get it: the AlexNet network defined here is split into many layers (inside the construct method), and TensorSummary is used to record tensor data while ImageSummary records image data. But how is the other parameter data recorded? Keep learning.

Starting training

When in doubt, Ctrl+C:

```python
# The imports below are not shown in the original post; the paths follow MindSpore 1.0-era tutorials.
from mindspore import Tensor
from mindspore.train import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor, SummaryCollector
from mindspore.nn.metrics import Accuracy

network = AlexNet(num_classes=10)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
# get_lr comes from the original tutorial notebook (see the note after this cell)
lr = Tensor(get_lr(0, 0.002, 10, ds_train.get_dataset_size()))
net_opt = nn.Momentum(network.trainable_params(), learning_rate=lr, momentum=0.9)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
config_ck = CheckpointConfig(save_checkpoint_steps=1562, keep_checkpoint_max=10)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet", config=config_ck)
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

summary_base_dir = "./summary_dir"
os.system(f"mindinsight start --summary-base-dir {summary_base_dir} --port=8080")

# Init a SummaryCollector callback instance, and use it in model.train or model.eval
specified = {"collect_metric": True, "histogram_regular": "^conv1.*|^conv2.*", "collect_graph": True, "collect_dataset_graph": True}
summary_collector = SummaryCollector(summary_dir="./summary_dir/summary_01", collect_specified_data=specified, collect_freq=1, keep_default_action=False, collect_tensor_freq=200)

print("============== Starting Training ==============")
model.train(epoch=10, train_dataset=ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor(), summary_collector], dataset_sink_mode=True)

print("============== Starting Testing ==============")
param_dict = load_checkpoint("checkpoint_alexnet-10_1562.ckpt")
load_param_into_net(network, param_dict)
acc = model.eval(ds_eval, callbacks=summary_collector, dataset_sink_mode=True)
print("============== {} ==============".format(acc))
```
After reading through the code carefully, it turns out the SummaryCollector callback is what records the scalar data and the parameter distribution histograms. The original tutorial explains how to use SummaryCollector very well. Here are my training results:

```text
epoch: 10 step: 1562, loss is 0.40318152
Epoch time: 116258.987, per step time: 74.430
============== Starting Testing ==============
============== {'Accuracy': 0.8340344551282052} ==============
```
Training ran for 10 epochs in total and reached an accuracy of about 0.83. Not a bad result, and with GPU acceleration it was also reasonably fast.

Viewing the results

Following the method in the tutorial, I opened 127.0.0.1:8080 in a local browser and finally saw the recorded results.

After quite some time, the effort finally paid off. MindSpore is really convenient to use, and following the official tutorial documents sped up both the learning and the hands-on experience. A big shout-out to this home-grown Chinese AI framework!

Contact: 602642050@qq.com