PyTorchBlitz学习笔记

以下为 3b1b 中有关反向传播算法原理的介绍

torch.autograd 简介

在对 Tensor 做操作时，autograd 会记录其输入张量与输出张量，并构建一个有向无环图(DAG)^[1]，其中叶子为输入张量，根为输出张量。在根上调用 .backward() 进行反向传播时，autograd 会从每个 .grad_fn 计算梯度，并累积到各自张量的 .grad 属性中，再利用链式法则一路传播到叶子（即输入张量）

python

import torch  
from torchvision.models import resnet18, ResNet18_Weights  
# Pretrained ResNet-18 plus one fabricated sample: a random 3-channel 64x64
# "image" and a random (1, 1000) target.
model = resnet18(weights=ResNet18_Weights.DEFAULT)
data = torch.rand(1, 3, 64, 64)
labels = torch.rand(1, 1000)

# SGD = Stochastic Gradient Descent.
#   model.parameters(): the weights and biases of the model
#   lr:                 learning rate, i.e. the step size
#   momentum:           momentum term, can be thought of as inertia
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

# Forward pass.
prediction = model(data)

# A toy scalar cost (not one that is commonly used in practice).
residual = prediction - labels
loss = residual.sum()

# Backward pass: fills .grad on every parameter of the model.
loss.backward()

# Gradient-descent step using the gradients computed above.
optim.step()

Autograd 中的自动微分

python

import torch  
  
# Two leaf tensors; requires_grad=True asks autograd to track every
# operation performed on them so gradients can be computed later.
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

# Q = 3a^3 - b^2, evaluated element-wise.
cubic_term = 3 * a**3
square_term = b**2
Q = cubic_term - square_term

# Q is a vector, so backward() needs an explicit upstream gradient of the
# same shape; a vector of ones is passed here.
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)

# Check that the accumulated gradients match the analytic derivatives:
# dQ/da = 9a^2 and dQ/db = -2b.
print(a.grad == 9 * a**2)
print(b.grad == -2 * b)

autograd 追踪计算原理：

神经网络(Neural Networks)

结合 3B1B 视频可以更好理解原理

python

import torch  
import torch.nn as nn  
import torch.nn.functional as F  
  
class Net(nn.Module):
    """LeNet-style convolutional network for single-channel 32x32 images.

    Two convolution + max-pool stages extract features, then three fully
    connected layers turn the flattened features into 10 class scores.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Convolution layers.
        # conv1: 1 input image channel, 6 output channels, 5x5 square kernel.
        self.conv1 = nn.Conv2d(1, 6, 5)
        # conv2: 6 input channels, 16 output channels, 5x5 square kernel.
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Fully connected layers, each an affine operation y = Wx + b
        # (weight x features + bias).
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, input):
        """Run the forward pass.

        Args:
            input: Tensor of shape (N, 1, 32, 32), where N is the batch size.

        Returns:
            Tensor of shape (N, 10) holding one raw score per class.
        """
        # --- Feature extraction ---
        # Convolution layer C1 with ReLU activation:
        # (N, 1, 32, 32) -> (N, 6, 28, 28).
        c1 = F.relu(self.conv1(input))
        # Subsampling layer S2: 2x2 max pooling, purely functional (it has no
        # parameters); halves height and width -> (N, 6, 14, 14).
        s2 = F.max_pool2d(c1, (2, 2))
        # Convolution layer C3 with ReLU activation: -> (N, 16, 10, 10).
        c3 = F.relu(self.conv2(s2))
        # Subsampling layer S4: 2x2 max pooling, purely functional
        # -> (N, 16, 5, 5).
        s4 = F.max_pool2d(c3, 2)
        # Flatten every dimension except the batch one -> (N, 400).
        s4 = torch.flatten(s4, 1)
        # --- Classification ---
        # Fully connected layer F5 with ReLU activation: (N, 400) -> (N, 120).
        f5 = F.relu(self.fc1(s4))
        # Fully connected layer F6 with ReLU activation: (N, 120) -> (N, 84).
        f6 = F.relu(self.fc2(f5))
        # Output layer: (N, 84) -> (N, 10), the scores of the 10 classes.
        output = self.fc3(f6)
        return output
  
net = Net()
print(net)
# Learnable parameters of the model.
params = list(net.parameters())
# 2 convolution layers + 3 fully connected layers, each contributing a
# weight tensor and a bias tensor: 5 * 2 = 10 parameter tensors.
print(len(params))
# conv1's .weight has layout
# [out_channels, in_channels, kernel_height, kernel_width]:
# out_channels is the number of kernels, in_channels is 1 because the layer
# takes a single image channel, and the kernel size is 5 x 5.
print(params[0].size())

# Feed a random 32x32 single-channel input through the network.
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)
# Backpropagate with a random upstream gradient.
net.zero_grad()
out.backward(torch.randn(1, 10))

# Loss functions from the nn package.
output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()  # MSE: mean squared error between output and target

loss = criterion(output, target)  # value of the loss function
print(loss)

# Backpropagation.
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

# Update the weights by hand with plain SGD:
#   weight = weight - learning_rate * gradient
# The in-place update runs under no_grad() so autograd does not record it;
# this replaces going through `.data`, which bypasses autograd's checks.
learning_rate = 0.01
with torch.no_grad():
    for f in net.parameters():
        f.sub_(f.grad * learning_rate)

训练图像分类器(Training a Classifier)

神经网络训练循环流程图如图(ChatGPT生成)。注意：官网给出的代码中 num_workers=2 在 Windows 系统上会出问题——Windows 以 spawn 方式创建子进程，每个 DataLoader 工作进程都会重新导入主模块并再次执行训练代码，从而报错或陷入死循环。解决方法是将训练代码全部放在 if __name__ == '__main__': 下，或直接改为 num_workers=0^[2]

python

import torch  
import torch.nn as nn  
import torch.nn.functional as F  
import torch  
import torchvision  
import torchvision.transforms as transforms  
import torch.optim as optim  
  
class Net(nn.Module):
    """Small CNN for 3-channel images that outputs 10 class scores.

    Two conv -> ReLU -> max-pool stages are followed by three fully
    connected layers; the first expects 16 * 5 * 5 flattened features.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        # A single 2x2 / stride-2 pooling module, reused after both convs.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        # Feature extraction: conv -> ReLU -> pool, twice.
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        # Flatten all dimensions except the batch dimension.
        flat = torch.flatten(features, start_dim=1)
        # Classifier head; the last layer has no activation.
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
  
net = Net()

# Everything that touches the DataLoaders lives under the main guard, so the
# training code only runs when this file is executed as a script and not when
# it is imported.
if __name__ == '__main__':
    transform = transforms.Compose(
        [transforms.ToTensor(),  # convert the image to a tensor
         # standardize each RGB channel with mean 0.5 and std 0.5
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    batch_size = 4  # number of samples per mini-batch

    # Training set.
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    # The DataLoader feeds the data: batch_size samples at a time,
    # shuffle=True randomizes the order, num_workers is the number of
    # worker subprocesses used for loading.
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                              shuffle=True, num_workers=10)

    # Test set.
    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                             shuffle=False, num_workers=10)

    # Class names, indexed by the integer label.
    classes = ('plane', 'car', 'bird', 'cat',
               'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

    import matplotlib.pyplot as plt
    import numpy as np

    def imshow(img):
        """Display an image tensor that was normalized by `transform`."""
        img = img / 2 + 0.5  # unnormalize
        npimg = img.numpy()
        # Move the first axis last: (C, H, W) -> (H, W, C) for matplotlib.
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
        plt.show()

    # Get some random training images.
    dataiter = iter(trainloader)
    images, labels = next(dataiter)

    # Show the images.
    imshow(torchvision.utils.make_grid(images))
    # Print their labels.
    print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))

    # Define the loss function and the optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # Train the network.
    for epoch in range(2):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                running_loss = 0.0

    print('Finished Training')

    # Save the trained model.
    PATH = './cifar_net.pth'
    torch.save(net.state_dict(), PATH)

    # Test on one batch of the test set.
    dataiter = iter(testloader)
    images, labels = next(dataiter)

    # print images
    imshow(torchvision.utils.make_grid(images))
    # Iterate over batch_size (not a hard-coded 4) so this stays consistent
    # with the DataLoader configuration above.
    print('GroundTruth: ', ' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))

    # Reload the saved weights into a fresh network.
    net = Net()
    net.load_state_dict(torch.load(PATH, weights_only=True))
    outputs = net(images)  # the network's scores for this batch
    _, predicted = torch.max(outputs, 1)

    # Predicted classes for the batch shown above.
    print('Predicted: ', ' '.join(f'{classes[predicted[j]]:5s}' for j in range(batch_size)))

    # Performance on the whole test set.
    correct = 0
    total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            # calculate outputs by running images through the network
            outputs = net(images)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')

    # Performance per class.
    # prepare to count predictions for each class
    correct_pred = {classname: 0 for classname in classes}
    total_pred = {classname: 0 for classname in classes}

    # again no gradients needed
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            outputs = net(images)
            _, predictions = torch.max(outputs, 1)
            # collect the correct predictions for each class
            for label, prediction in zip(labels, predictions):
                if label == prediction:
                    correct_pred[classes[label]] += 1
                total_pred[classes[label]] += 1

    # print accuracy for each class
    for classname, correct_count in correct_pred.items():
        accuracy = 100 * float(correct_count) / total_pred[classname]
        print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')

最终训练结果如图：

示例训练结果

图：由节点(Nodes，代表实体或任务)和连接的边(Edges，代表关系)组成；有向：边是有方向的，不能反向逆行；无环：从任一节点出发顺着箭头走，不会回到该节点 ↩︎
官网也有给解决方案:If you are running this tutorial on Windows or MacOS and encounter a BrokenPipeError or RuntimeError related to multiprocessing, try setting the num_worker of torch.utils.data.DataLoader() to 0. ↩︎

PyTorchBlitz学习笔记 ​

torch.autograd 简介 ​

神经网络(Neural Networks) ​

训练图像分类器(Training a Classifier) ​

PyTorchBlitz学习笔记

torch.autograd 简介

神经网络(Neural Networks)

训练图像分类器(Training a Classifier)