深度学习神经网络基础笔记

本文最后更新于：2023年6月17日下午

模型构造

预备知识

模型
损失函数
- 因为需要计算损失函数的梯度，所以我们应该先定义损失函数。
优化算法
- 在每一步中，使用从数据集中随机抽取的一个小批量，然后根据参数计算损失的梯度。
- 接下来，朝着减少损失的方向更新我们的参数。
- 该函数接受模型参数集合、学习速率和批量大小作为输入。每一步更新的大小由学习速率lr决定。
训练
- 在每次迭代中，我们读取一小批量训练样本，并通过我们的模型来获得一组预测，并计算损失
- 计算完损失后，我们开始反向传播，存储每个参数的梯度。最后，我们调用优化算法sgd来更新模型参数
- 概括
  1. 初始化参数
    - net.apply(init_func)  （传入函数本身，而不是调用它）
    - nn.init.X_()
  2. 重复以下训练
    1. 计算梯度 \(\mathbf{g} \leftarrow \partial_{(\mathbf{w},b)} \frac{1}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} l(\mathbf{x}^{(i)}, y^{(i)}, \mathbf{w}, b)\)
      - 计算损失,loss = nn.CrossEntropyLoss()，l = loss(net(X),y)
      - 计算梯度,l.sum().backward()
    2. 更新参数\((\mathbf{w}, b) \leftarrow (\mathbf{w}, b) - \eta \mathbf{g}\)
      - 常写为trainer，或者optimizer
      - optimizer/trainer = torch.optim.SGD(net.parameters(), lr=lr)
总结：

实现流程

import numpy as np
import torch
from d2l import torch as d2l
from torch.nn import functional as F
from torch.utils import data
 
# Concise implementation of linear regression

# Use the deep learning framework to implement linear regression concisely.
# First, generate a synthetic dataset from known ground-truth parameters.
true_w, true_b = torch.tensor([2, -3.4]), 4.2
features, labels = d2l.synthetic_data(true_w, true_b, 1000)
 
 
def load_array(data_arrays, batch_size, is_train=True):
    """Build a PyTorch data iterator over the given tensors.

    Args:
        data_arrays: Tensors unpacked into ``data.TensorDataset``.
        batch_size: Number of samples per mini-batch.
        is_train: Passed to the loader as ``shuffle``.

    Returns:
        A ``data.DataLoader`` over the wrapped tensors.
    """
    # The dataset wraps the tensors; the loader draws mini-batches from it.
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(tensor_dataset, batch_size, shuffle=is_train)
    return loader
 
 
batch_size = 10
data_iter = load_array((features, labels), batch_size)
# next() on the iterator yields one (X, y) mini-batch.
print(next(iter(data_iter)))


# Use the framework's predefined layers.
# nn is short for "neural network".
from torch import nn

# Linear(2, 1): input dimension 2, output dimension 1.
# Sequential is a container holding an ordered list of layers.
net = nn.Sequential(
    nn.Linear(2, 1),
)


# Initialise the model parameters.
# Overwrite the weight values in place with samples from a normal distribution.
net[0].weight.data.normal_(0, 0.01)
net[0].bias.data.fill_(0)

# The mean squared error is computed with the MSELoss class
# (also known as the squared L2 norm).
loss = nn.MSELoss()

# Instantiate SGD; it lives in the torch.optim module.
# net.parameters() covers both w and b.
trainer = torch.optim.SGD(net.parameters(), lr=0.03)


# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    for X, y in data_iter:
        # Clear gradients first, otherwise they accumulate on top of the
        # previous step's gradients.
        trainer.zero_grad()
        l = loss(net(X), y)
        l.backward()
        # step() updates the model, i.e. the weight and the bias.
        trainer.step()
    # Report the loss over the full dataset after each epoch.
    l = loss(net(features), labels)
    print(f'epoch {epoch + 1}, loss {l:f}')

实现流程

def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """Train a model with a GPU (defined in Chapter 6).

    Defined in :numref:`sec_lenet`

    Args:
        net: Model to train; Linear/Conv2d weights are re-initialised with
            Xavier uniform before training.
        train_iter: Iterable of ``(X, y)`` training mini-batches; must
            support ``len()``.
        test_iter: Iterable of test mini-batches used for the accuracy check
            at the end of every epoch.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for SGD.
        device: Device the model and each mini-batch are moved to.
    """
    def init_weights(m):
        # Only plain Linear / Conv2d layers are re-initialised.
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    # Define the optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    # Define the loss function
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    # Plot about five points per epoch. max(1, ...) fixes the original
    # ZeroDivisionError from `% (num_batches // 5)` when there are fewer
    # than five batches.
    plot_every = max(1, num_batches // 5)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            # Reset the gradients accumulated by the previous step
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            # Compute the loss
            l = loss(y_hat, y)
            # Back-propagate to compute the gradients
            l.backward()
            # Update the model parameters
            optimizer.step()
            # Accumulate the running loss / accuracy statistics
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        # Fixed: the helper lives in the d2l package; the bare name was
        # never defined or imported in this file.
        test_acc = d2l.evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')

模型构造

class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer (20 -> 256 -> 10)."""

    def __init__(self):
        """Declare every layer the network needs."""
        # Fixed: the original `super().__init()__` was a syntax error.
        super().__init__()
        self.hidden = nn.Linear(20, 256)  # hidden layer
        self.out = nn.Linear(256, 10)  # output layer

    def forward(self, X):
        """Run X through the hidden layer, ReLU, then the output layer."""
        return self.out(F.relu(self.hidden(X)))

1
2
3

net = MLP()
# Fixed: X was used without being defined. MLP expects 20 input features.
X = torch.rand(2, 20)
net(X)
# The nn.Module parent class maps a call on the module to forward(), so
# net.forward(X) can be written simply as net(X).

嵌套块

def block1():
    """Return a block: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU."""
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
    return nn.Sequential(*layers)

def block2():
    """Return a Sequential of four block1() instances named block0..block3."""
    stacked = nn.Sequential()
    for idx in range(4):
        # Register each nested block under its own name.
        stacked.add_module(f'block{idx}', block1())
    return stacked

# Nest the stacked blocks inside another Sequential, followed by a final
# Linear(4, 1) layer.
rgnet = nn.Sequential(block2(),nn.Linear(4,1))

`nn.Sequential()`类注释

# 手动实现

class MySequential(nn.Module):
    """Minimal hand-written version of ``nn.Sequential``.

    The modules passed to the constructor are stored in order and applied
    one after another in ``forward``.
    """

    def __init__(self, *args):
        """Store the given modules; ``*args`` packs them into one tuple."""
        super().__init__()
        for idx, block in enumerate(args):
            # Fixed: _modules is keyed by string names. The original used the
            # module object itself as the key, which breaks name-based APIs
            # such as state_dict().
            self._modules[str(idx)] = block

    def forward(self, X):
        """Call each stored module in insertion order and return the result."""
        # Fixed: the original misspelled the attribute as `_modeles` and mixed
        # tabs with spaces in this method.
        for block in self._modules.values():
            X = block(X)
        return X

# Pass the layers in order; they are collected by the *args parameter.
net = MySequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,10))

# NOTE(review): X must be a batch with 20 features, defined beforehand.
net(X)

参数管理

class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer (20 -> 256 -> 10)."""

    def __init__(self):
        """Declare the hidden and output layers."""
        # Fixed: the original `super().__init()__` was a syntax error.
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        """Run X through the hidden layer, ReLU, then the output layer."""
        return self.out(F.relu(self.hidden(X)))

# Show the parameters of the hidden layer.
# Fixed: the original line had a stray leading space (IndentationError) and
# relied on `net` being an MLP, so instantiate one here.
net = MLP()
net.hidden.state_dict()

# A single Linear(8, 8) instance is placed at two positions in the network.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
# Fixed: X was never defined for this network, which takes 4 input features.
X = torch.rand(size=(2, 4))
net(X)
# Parameters of nn.Linear(4, 8)
net[0].state_dict()
# Parameters of nn.Linear(8, 1)
net[6].state_dict()

内置初始化

EG1：

def init_normal(m):
    """Initialise a linear layer: weights ~ N(0, std=0.01), bias = 0.

    Meant to be handed to ``net.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    # The trailing underscore marks an in-place function: it modifies the
    # tensor it is given directly.
    nn.init.normal_(m.weight, 0, 0.01)
    nn.init.zeros_(m.bias)

# Walk over every layer in the net and apply the init_normal rule.
# apply() gives you a way to traverse the whole network (net[0]..net[n])
# and make modifications along the way.
net.apply(init_normal)

# Inspect the result: normally distributed weights and a zero bias.
net[0].weight.data[0],net[0].bias.data[0]


EG2：

def init_constant(m):
    """Set a linear layer's weights to 1 and its bias to 0.

    Modules whose type is not exactly ``nn.Linear`` are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    # Every weight becomes 1 ...
    nn.init.constant_(m.weight, 1)
    # ... and every bias becomes 0.
    nn.init.zeros_(m.bias)

# Apply the constant initialisation to every module in the network.
net.apply(init_constant)


EG3：
#可以对不同的层应用不同的事情

def xavier(m):
    """Apply Xavier-uniform initialisation to a linear layer's weights.

    Modules whose type is not exactly ``nn.Linear`` are left untouched.
    """
    # Fixed: the body was tab-indented inside a space-indented file.
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)


# The function was originally misspelled `xavir` while the caller below uses
# `xavier`. Keep the old name as an alias so both spellings work.
xavir = xavier
        
def init_42(m):
    """Set every weight of a linear layer to the constant 42.

    Modules whose type is not exactly ``nn.Linear`` are left untouched.
    """
    if type(m) == nn.Linear:
        # Fixed: `m.weight.42` was a syntax error; the value is the second
        # argument of constant_.
        nn.init.constant_(m.weight, 42)

# Initialise different layers of the network with different methods.
net[0].apply(xavier)
net[2].apply(init_42)

# Inspect the weights of the two layers that were just initialised.
print(net[0].weight.data[0])
print(net[2].weight.data)

参数绑定

# Tie the parameters of the third and fifth layers.
# However they change, the two layers always hold the same parameters,
# because both positions refer to the same Linear instance.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
# Fixed: X was never defined for this network, which takes 4 input features.
X = torch.rand(size=(2, 4))
net(X)

自定义层

class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the mean from its input."""

    def __init__(self):
        super().__init__()

    def forward(self, X):
        """Return X minus the mean over all of its elements."""
        mean = X.mean()
        return X - mean

# Instantiate the layer and feed it a small tensor.
layer = CenteredLayer()
layer(torch.FloatTensor([1,2,3,4,5]))

`nn.Parameter()` 模型参数的初始化

Python的torch.nn.Parameter初始化方法_nn.parameter 初始化

自定义带参数的层

# A custom fully connected layer.
# Input size is in_units, output size is units.
# Weight is initialised as randn(in_units, units).
# Bias is initialised as randn(units,).
class MyLinear(nn.Module):
    """Fully connected layer with a ReLU activation and its own parameters."""

    def __init__(self, in_units, units):
        """Create the weight and bias parameters.

        Args:
            in_units: Number of input features.
            units: Number of output features.
        """
        super().__init__()
        # Wrapping the tensors in nn.Parameter registers them with the module
        # and enables gradient tracking (requires_grad=True).
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        """Return relu(X @ weight + bias)."""
        # Fixed: the original used `.data`, which detaches the parameters
        # from autograd, so they never received gradients. Use the
        # parameters directly.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)

# Instantiate the custom layer and inspect its weight parameter.
dense = MyLinear(5, 3)
# Fixed: `demse` was a typo for `dense` (NameError).
dense.weight

读写文件

张量

# Fixed: x was used without being defined.
x = torch.arange(4)
y = torch.zeros(4)
# Save a list of tensors to a file, then read it back.
torch.save([x, y], 'x-files')
x2, y2 = torch.load('x-files')

从字符串映射到张量的字典

1
2
3

# Save a dictionary mapping strings to tensors, then read it back.
mydict = dict(x=x, y=y)
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')

加载和保存模型参数

存的是模型的权重，不存计算

class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer (20 -> 256 -> 10)."""

    def __init__(self):
        """Declare every layer the network needs."""
        # Fixed: the original `super().__init()__` was a syntax error.
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        """Run X through the hidden layer, ReLU, then the output layer."""
        return self.out(F.relu(self.hidden(X)))
    
net = MLP()
X = torch.randn(2, 20)
Y = net(X)

# Store all of the MLP's parameters as a dictionary that maps each
# parameter name to its tensor.
torch.save(net.state_dict(), 'mlp.params')


# Copy / restore the saved state:
# declare a fresh network and load the previously saved state into it with
# load_state_dict, which gives a backup of the original model.
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))

Notes

#DeepLearning

深度学习神经网络基础笔记

https://anonymouslosty.ink/2023/06/02/深度学习神经网络基础笔记/

作者

Ling yi

发布于

2023年6月2日

更新于

2023年6月17日

许可协议

深度学习卷积笔记上一篇

自己的学习环境搭建下一篇

深度学习 神经网络基础 笔记

模型构造

预备知识

模型构造

嵌套块

nn.Sequential()类注释

参数管理

内置初始化

参数绑定

自定义层

nn.Parameter() 模型参数的初始化

自定义带参数的层

读写文件

张量

从字符串映射到张量的字典

加载和保存模型参数

深度学习神经网络基础笔记

`nn.Sequential()`类注释

`nn.Parameter()` 模型参数的初始化