Building My First Neural Network with NumPy
Preface
Handwritten digit recognition implemented in pure NumPy. I first lay out the overall structure of the network, then walk through the code. This is my first neural network.
Full code: GitHub
Overall structure of the network:
Input layer, hidden layer, output layer. What we know from the start is that the output layer produces ten values: the probability of each of the ten digits.
On the training, validation, and test sets
Gradient descent worked out by hand
Solving for the three parameters
Detailed hand calculation for the first parameter
Code
Activation functions and their derivatives
import math
import copy
import struct
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pylab

def tanh(x):
    return np.tanh(x)

def bypass(x):
    return x

def softmax(x):
    # subtract the maximum to keep the exponentials from overflowing;
    # the shift cancels out in the final division, so the result is unchanged
    exp = np.exp(x - x.max())
    return exp / exp.sum()

def d_softmax(data):
    # Jacobian matrix of softmax
    sm = softmax(data)
    return np.diag(sm) - np.outer(sm, sm)

def d_tanh(data):
    # element-wise derivative, returned as a vector
    return 1 / (np.cosh(data)) ** 2

def d_bypass(x):
    return 1

differential = {softmax: d_softmax, tanh: d_tanh, bypass: d_bypass}
d_type = {bypass: 'times', softmax: 'dot', tanh: 'times'}  # how each derivative is applied during backprop
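A quick aside (my own check, not from the original post): the max-subtraction trick can be verified numerically. Shifting the input does not change the softmax output, but it keeps the exponentials finite for large inputs.

x = np.array([1.0, 2.0, 3.0])
naive = np.exp(x) / np.exp(x).sum()      # fine for small inputs
print(np.allclose(softmax(x), naive))    # True: the shift does not change the result

big = np.array([1000.0, 1001.0, 1002.0])
print(softmax(big))                      # still finite; np.exp(big) on its own would overflow to inf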
Initializing the parameters
dimensions = [28 * 28, 100, 10]       # 28*28 input neurons, a hidden layer of 100 neurons, 10 output digits
                                      # the 784 inputs connect to the 100 hidden neurons, which connect to the 10 outputs
activation = [bypass, tanh, softmax]  # bypass for the input layer, then the two real activations
distribution = [                      # initialization ranges for each layer
    {},                               # the input layer has no parameters
    {'b': [0, 0],
     'w': [-math.sqrt(6 / (dimensions[0] + dimensions[1])), math.sqrt(6 / (dimensions[0] + dimensions[1]))]},
    {'b': [0, 0],
     'w': [-math.sqrt(6 / (dimensions[1] + dimensions[2])), math.sqrt(6 / (dimensions[1] + dimensions[2]))]},
]                                     # the ±sqrt(6/(n_in+n_out)) range is the Xavier (Glorot) uniform initialization
Parameter initialization functions
def init_parameters_b(layer):
    # initialize b for the given layer, uniformly within its distribution range
    dist = distribution[layer]['b']
    return np.random.rand(dimensions[layer]) * (dist[1] - dist[0]) + dist[0]

def init_parameters_w(layer):
    # initialize w for the given layer, uniformly within its distribution range
    dist = distribution[layer]['w']
    return np.random.rand(dimensions[layer - 1], dimensions[layer]) * (dist[1] - dist[0]) + dist[0]

def init_parameters():
    parameter = []                 # one dict of parameters per layer
    for i in range(len(distribution)):
        layer_parameter = {}       # parameters of the current layer
        for j in distribution[i].keys():
            if j == 'b':
                layer_parameter['b'] = init_parameters_b(i)
                continue
            if j == 'w':
                layer_parameter['w'] = init_parameters_w(i)
                continue
        parameter.append(layer_parameter)
    return parameter
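A small sanity check I added: the shapes produced by the initializer should line up with dimensions = [784, 100, 10].

p = init_parameters()
print(p[0])               # {}  - the input layer has no parameters
print(p[1]['w'].shape)    # (784, 100)
print(p[1]['b'].shape)    # (100,)
print(p[2]['w'].shape)    # (100, 10)
print(p[2]['b'].shape)    # (10,)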
The very first parameters, straight from initialization (not yet trained)
parameters = init_parameters()
Prediction function
def predict(img, parameters):      # arguments: image, parameters
    l_in = img
    l_out = activation[0](l_in)    # the first layer just passes the input through (bypass)
    for layer in range(1, len(dimensions)):
        l_in = np.dot(l_out, parameters[layer]['w']) + parameters[layer]['b']   # forward through each layer
        l_out = activation[layer](l_in)
    return l_out
The first prediction (it uses the freshly initialized model, so the result is essentially random)
predict(train_img[0], init_parameters())
# result:
# array([0.07210171, 0.07957606, 0.13152407, 0.05420442, 0.08498909,
#        0.12788144, 0.14911174, 0.14570486, 0.08225591, 0.07265069])
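Because softmax is the final activation, the ten outputs should always form a probability distribution; a quick check of that (my addition):

out = predict(train_img[0], parameters)
print(out.sum())      # ~1.0, since softmax normalizes the outputs
print(out.argmax())   # the digit the still-untrained network currently favours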
Training, validation, and test sets
dataset_path = Path('D:/Desktop/MNIST')
train_img_path = dataset_path / 'train-images-idx3-ubyte/train-images.idx3-ubyte'
train_lab_path = dataset_path / 'train-labels-idx1-ubyte/train-labels.idx1-ubyte'
test_img_path = dataset_path / 't10k-images-idx3-ubyte/t10k-images.idx3-ubyte'
test_lab_path = dataset_path / 't10k-labels-idx1-ubyte/t10k-labels.idx1-ubyte'
Splitting the datasets
train_num = 50000   # training
valid_num = 10000   # validation
test_num = 10000    # test

with open(train_img_path, 'rb') as f:
    struct.unpack('>4i', f.read(16))   # skip the 16-byte IDX header
    temp_img = np.fromfile(f, dtype=np.uint8).reshape(-1, 28 * 28) / 255
    train_img = temp_img[:train_num]   # carve 10k images off the training data for validation
    valid_img = temp_img[train_num:]

with open(test_img_path, 'rb') as f:
    struct.unpack('>4i', f.read(16))
    test_img = np.fromfile(f, dtype=np.uint8).reshape(-1, 28 * 28) / 255

with open(train_lab_path, 'rb') as f:
    struct.unpack('>2i', f.read(8))    # skip the 8-byte IDX header
    temp_lab = np.fromfile(f, dtype=np.uint8)
    train_lab = temp_lab[:train_num]
    valid_lab = temp_lab[train_num:]

with open(test_lab_path, 'rb') as f:
    struct.unpack('>2i', f.read(8))
    test_lab = np.fromfile(f, dtype=np.uint8)
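A quick check (my addition) that the split came out as intended:

print(train_img.shape, train_lab.shape)   # (50000, 784) (50000,)
print(valid_img.shape, valid_lab.shape)   # (10000, 784) (10000,)
print(test_img.shape, test_lab.shape)     # (10000, 784) (10000,)
print(train_img.min(), train_img.max())   # pixel values scaled to [0, 1]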
Displaying an image and its label
def show_train(index):
    plt.imshow(train_img[index].reshape(28, 28), cmap='gray')
    pylab.show()
    print('label:{}'.format(train_lab[index]))

def show_valid(index):
    plt.imshow(valid_img[index].reshape(28, 28), cmap='gray')
    pylab.show()
    print('label:{}'.format(valid_lab[index]))

def show_test(index):
    plt.imshow(test_img[index].reshape(28, 28), cmap='gray')
    pylab.show()
    print('test:{}'.format(test_lab[index]))
Prediction on random input
predict(np.random.rand(784), parameters)
# result:
# array([0.0942381 , 0.11644771, 0.05850607, 0.23711087, 0.02732923,
#        0.0176975 , 0.19317991, 0.14196864, 0.08510021, 0.02842176])
Verify the derivatives against the finite-difference definition before going any further; if a derivative were wrong, everything that follows would be wasted effort.
h = 0.0001
func = softmax
input_len = 4
for i in range(input_len):
    # two ways to compute the derivative:
    # the finite-difference definition versus the analytic formula
    test_input = np.random.rand(input_len)
    derivative = differential[func](test_input)
    value1 = func(test_input)
    test_input[i] += h
    value2 = func(test_input)
    # print((value2 - value1) / h)
    # print(derivative[i] - (value2 - value1) / h)   # difference between the two

onehot = np.identity(dimensions[-1])   # one-hot encodings of the 10 digits
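The same idea can be applied to the whole softmax Jacobian at once; this is my own variant of the loop above, comparing d_softmax against finite differences column by column:

test_input = np.random.rand(input_len)
analytic = d_softmax(test_input)               # full Jacobian, shape (4, 4)
numeric = np.zeros((input_len, input_len))
for i in range(input_len):
    shifted = test_input.copy()
    shifted[i] += h
    numeric[:, i] = (softmax(shifted) - softmax(test_input)) / h
print(np.abs(analytic - numeric).max())        # should be on the order of h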
Gradient descent: here we back-propagate the error (in short, repeated application of the chain rule). To make the loss function as small as possible, the prediction y_pred has to move as close as possible to the true label y, so each step updates the parameters a little in the direction that reduces the loss, and the update is repeated over and over.
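In symbols (my own notation, matching the code below rather than the original hand-written derivation): the squared loss is

$$L = \lVert y_{\text{pred}} - y \rVert^2, \qquad \frac{\partial L}{\partial y_{\text{pred}}} = -2\,(y - y_{\text{pred}}),$$

and for each layer $k$ with $l_{\text{in}}^{(k)} = l_{\text{out}}^{(k-1)} W^{(k)} + b^{(k)}$ and $\delta^{(k)} = \partial L / \partial l_{\text{in}}^{(k)}$,

$$\frac{\partial L}{\partial b^{(k)}} = \delta^{(k)}, \qquad \frac{\partial L}{\partial W^{(k)}} = l_{\text{out}}^{(k-1)} \otimes \delta^{(k)}, \qquad \frac{\partial L}{\partial l_{\text{out}}^{(k-1)}} = W^{(k)} \delta^{(k)}, \qquad \theta \leftarrow \theta - \eta\,\frac{\partial L}{\partial \theta},$$

where $\delta^{(k)}$ follows from $\partial L / \partial l_{\text{out}}^{(k)}$ through the activation's derivative (element-wise for tanh, a Jacobian product for softmax).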
def sqr_loss(img, lab, parameters):
    y_pred = predict(img, parameters)
    y = onehot[lab]
    diff = y - y_pred
    return np.dot(diff, diff)                # squared error between prediction and one-hot label

def grad_parameters(img, lab, parameters):   # arguments: image, label, parameters
    # forward pass, keeping every layer's input and output for the backward pass
    l_in_list = [img]                        # layer 0 has no parameters, so its "in" is just the image
    l_out_list = [activation[0](l_in_list[0])]
    for layer in range(1, len(dimensions)):
        l_in = np.dot(l_out_list[layer - 1], parameters[layer]['w']) + parameters[layer]['b']
        l_out = activation[layer](l_in)
        l_in_list.append(l_in)
        l_out_list.append(l_out)

    d_layer = -2 * (onehot[lab] - l_out_list[-1])     # dL/dy_pred for the squared loss
    grad_result = [None] * len(dimensions)
    for layer in range(len(dimensions) - 1, 0, -1):   # backward pass
        if d_type[activation[layer]] == 'times':      # element-wise derivative (tanh, bypass)
            d_layer = differential[activation[layer]](l_in_list[layer]) * d_layer
        if d_type[activation[layer]] == 'dot':        # Jacobian product (softmax)
            d_layer = np.dot(differential[activation[layer]](l_in_list[layer]), d_layer)
        grad_result[layer] = {}
        grad_result[layer]['b'] = d_layer
        grad_result[layer]['w'] = np.outer(l_out_list[layer - 1], d_layer)   # outer product with the previous layer's output
        d_layer = np.dot(parameters[layer]['w'], d_layer)                    # propagate the gradient to the previous layer
    return grad_result
Parameter gradients from one backward pass
grad_parameters(train_img[0], train_lab[0], init_parameters())
Verifying that the backpropagated partial derivatives are correct
# check the gradient of b
h = 0.00001   # verify that the backward pass differentiates correctly
layer = 2
parameters = init_parameters()
pname = 'b'
for i in range(len(parameters[layer][pname])):
    # two ways to compute the derivative: finite differences versus backpropagation
    img_i = np.random.randint(train_num)        # pick a random training image
    test_parameters = init_parameters()         # fresh random parameters
    derivative = grad_parameters(train_img[img_i], train_lab[img_i], test_parameters)[layer][pname]
    value1 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
    test_parameters[layer][pname][i] += h
    value2 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
    print(derivative[i] - (value2 - value1) / h)

# check the gradient of w
h = 0.00001
layer = 1
parameters = init_parameters()
pname = 'w'
grad_list = []
for i in range(len(parameters[layer][pname])):
    for j in range(len(parameters[layer][pname][0])):
        img_i = np.random.randint(train_num)
        test_parameters = init_parameters()
        derivative = grad_parameters(train_img[img_i], train_lab[img_i], test_parameters)[layer][pname]
        value1 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
        test_parameters[layer][pname][i][j] += h
        value2 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
        grad_list.append(derivative[i][j] - (value2 - value1) / h)

np.abs(grad_list).max()
Loss and accuracy functions
def valid_loss(parameters):        # loss on the validation set
    loss_accu = 0
    for img_i in range(valid_num):
        loss_accu += sqr_loss(valid_img[img_i], valid_lab[img_i], parameters)
    return loss_accu / (valid_num / 10000)   # normalize to the loss per 10,000 images,
                                             # so training and validation losses are comparable

def valid_accuracy(parameters):    # accuracy on the validation set
    correct = [predict(valid_img[img_i], parameters).argmax() == valid_lab[img_i]
               for img_i in range(valid_num)]
    return correct.count(True) / len(correct)

def train_loss(parameters):        # loss on the training set
    loss_accu = 0
    for img_i in range(train_num):
        loss_accu += sqr_loss(train_img[img_i], train_lab[img_i], parameters)
    return loss_accu / (train_num / 10000)

def train_accuracy(parameters):    # accuracy on the training set
    correct = [predict(train_img[img_i], parameters).argmax() == train_lab[img_i]
               for img_i in range(train_num)]
    return correct.count(True) / len(correct)

def test_accuracy(parameters):     # accuracy on the test set
    correct = [predict(test_img[img_i], parameters).argmax() == test_lab[img_i]
               for img_i in range(test_num)]
    return correct.count(True) / len(correct)
def grad_add(grad1, grad2):
    # accumulate grad2 into grad1, layer by layer
    for layer in range(1, len(grad1)):
        for pname in grad1[layer].keys():
            grad1[layer][pname] += grad2[layer][pname]
    return grad1

def grad_divide(grad, denominator):
    # divide every gradient by the batch size to get the average
    for layer in range(1, len(grad)):
        for pname in grad[layer].keys():
            grad[layer][pname] /= denominator
    return grad

def combine_parameters(parameters, grad, learn_rate):
    # build the new parameters: one gradient-descent step of size learn_rate
    parameter_tmp = copy.deepcopy(parameters)
    for layer in range(len(parameter_tmp)):
        for pname in parameter_tmp[layer].keys():
            parameter_tmp[layer][pname] -= learn_rate * grad[layer][pname]
    return parameter_tmp
Training one batch at a time
batch_size = 100   # treat 100 images as one mini-batch

def train_batch(current_batch, parameters):
    # accumulate the gradients of these 100 images and take the average
    grad_accu = grad_parameters(train_img[current_batch * batch_size],
                                train_lab[current_batch * batch_size],
                                parameters)
    for img_i in range(1, batch_size):
        grad_temp = grad_parameters(train_img[current_batch * batch_size + img_i],
                                    train_lab[current_batch * batch_size + img_i],
                                    parameters)
        grad_add(grad_accu, grad_temp)     # accumulate the gradients
    grad_divide(grad_accu, batch_size)     # average them to get the descent direction
    return grad_accu

parameters = init_parameters()
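Before wrapping this in the full loop, a single mini-batch update looks like the sketch below (my own illustration; the learning rate of 1.0 is just a placeholder):

grad = train_batch(0, parameters)                            # average gradient over the first 100 training images
new_parameters = combine_parameters(parameters, grad, 1.0)   # one gradient-descent step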
Training loop
from tqdm import tqdm_notebook

current_epoch = 0
train_loss_list = []   # training loss per epoch
valid_loss_list = []   # validation loss per epoch
train_accu_list = []   # training accuracy per epoch
valid_accu_list = []   # validation accuracy per epoch
learn_rate = 10 ** -0.3   # the learning rate should shrink towards the end of training
epoch_num = 5             # one full pass over the training data is one epoch

for epoch in tqdm_notebook(range(epoch_num)):
    for i in range(train_num // batch_size):
        # if i % 100 == 99:
        #     print('running batch{}/{}'.format(i + 1, train_num // batch_size))
        grad_tmp = train_batch(i, parameters)
        parameters = combine_parameters(parameters, grad_tmp, learn_rate)
    current_epoch += 1
    train_loss_list.append(train_loss(parameters))
    train_accu_list.append(train_accuracy(parameters))
    valid_loss_list.append(valid_loss(parameters))
    valid_accu_list.append(valid_accuracy(parameters))

valid_accuracy(parameters)
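The comment above says the learning rate should shrink towards the end of training; one simple way to do that (my own sketch, not part of the original run) is to multiply it by a decay factor after each epoch:

learn_rate = 10 ** -0.3
decay = 0.9                    # hypothetical per-epoch decay factor
for epoch in range(epoch_num):
    for i in range(train_num // batch_size):
        grad_tmp = train_batch(i, parameters)
        parameters = combine_parameters(parameters, grad_tmp, learn_rate)
    learn_rate *= decay        # take smaller steps as training progresses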
lower = 0
plt.plot(valid_loss_list[lower:], color='black', label='validation loss')
plt.plot(train_loss_list[lower:], color='red', label='train loss')
plt.legend()
plt.show()
Accuracy on the validation and training sets
plt.plot(valid_accu_list[lower:], color='black', label='validation accuracy')
plt.plot(train_accu_list[lower:], color='red', label='train accuracy')
plt.legend()
plt.show()