权值共享
import torch
import torch.nn as nn


class model(nn.Module):
    """Toy network whose two inputs are processed by one shared LSTM layer.

    ``forward`` calls ``self.lstm`` twice, so both branches use (and
    back-propagate into) the same set of LSTM parameters.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=10, hidden_size=5)
        # nn.Linear's keyword is ``in_features``; 5 matches the LSTM hidden_size.
        self.linear = nn.Linear(in_features=5, out_features=2)

    def forward(self, inputdata1, inputdata2):
        """Run both inputs through the shared LSTM and project their sum.

        Args:
            inputdata1: Tensor whose last dimension is 10 (the LSTM input_size).
            inputdata2: Tensor with the same shape as ``inputdata1``.

        Returns:
            Tensor shaped like the LSTM output, with a last dimension of 2.
        """
        # nn.LSTM returns ``(output, (h_n, c_n))``; only ``output`` is summed.
        lstm_result1, _ = self.lstm(inputdata1)
        lstm_result2, _ = self.lstm(inputdata2)
        return self.linear(lstm_result1 + lstm_result2)
注释:在神经网络的训练过程中经常用到两层网络共享权值,在上述代码片中,定义神经网络时定义一个lstm模型和一个全连接层,在前向计算中多次调用lstm层进行计算,相当于神经网络模型中有两个lstm层,即计算inputdata1和inputdata2的两个lstm共享权值。
参考:https://www.cnblogs.com/sdu20112013/p/12132786.html
某些层参数不更新
在查询此类资料时,在博客中看到了在模型层中设置requires_grad = False后参数仍会被训练的问题,博主也给出了相关的解决方法,这里记录其中两个使用起来比较方便的方法。
更多内容参考:https://blog.csdn.net/guotong1988/article/details/79739775
import torch
import torch.nn as nn


class model(nn.Module):
    """Shared-LSTM network whose LSTM is frozen and whose linear head trains."""

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=10, hidden_size=5)
        # Freeze everything registered so far (only the LSTM at this point).
        # Layers created after this loop keep the default requires_grad=True.
        for p in self.parameters():
            p.requires_grad = False
        # nn.Linear's keyword is ``in_features``; 5 matches the LSTM hidden_size.
        self.linear = nn.Linear(in_features=5, out_features=2)

    def forward(self, inputdata1, inputdata2):
        """Run both inputs through the frozen LSTM and project their sum.

        Args:
            inputdata1: Tensor whose last dimension is 10 (the LSTM input_size).
            inputdata2: Tensor with the same shape as ``inputdata1``.

        Returns:
            Tensor shaped like the LSTM output, with a last dimension of 2.
        """
        # nn.LSTM returns ``(output, (h_n, c_n))``; only ``output`` is summed.
        lstm_result1, _ = self.lstm(inputdata1)
        lstm_result2, _ = self.lstm(inputdata2)
        return self.linear(lstm_result1 + lstm_result2)
注释:在不需要参数更新的层后边添加如下代码行:
# Mark every parameter yielded by self.parameters() as not requiring gradients.
for p in self.parameters():p.requires_grad = False
但是上述方法只适用于模型中最初几层都不需要训练、只有顶层需要训练的情况;如果需要训练的层和不需要训练的层交替出现,上述方法就无法使用。博主给出了适用范围更广的方法:
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss


class model(nn.Module):
    """Two stacked LSTMs and a linear head; the second LSTM is frozen."""

    def __init__(self):
        super().__init__()
        self.lstm1 = nn.LSTM(input_size=10, hidden_size=10)
        self.lstm2 = nn.LSTM(input_size=10, hidden_size=5)
        self.linear = nn.Linear(in_features=5, out_features=2)
        # Layer constructors accept no ``requires_grad`` argument; the flag
        # lives on each parameter, so freeze lstm2 explicitly.
        for p in self.lstm2.parameters():
            p.requires_grad = False

    def forward(self, inputdata):
        """Map ``inputdata`` (last dimension 10) to 2 output scores per step."""
        # nn.LSTM returns ``(output, (h_n, c_n))``; keep only ``output``.
        lstm_result1, _ = self.lstm1(inputdata)
        lstm_result2, _ = self.lstm2(lstm_result1)
        # lstm_result1 has 10 features and lstm_result2 has 5, so they cannot
        # be summed; the head consumes the output of the last LSTM.
        return self.linear(lstm_result2)


# NOTE: this rebinds ``model`` from the class to its instance; the class is
# not needed again below.
model = model()

# Build a fake input and ground-truth label.
# A 2-D tensor is treated by nn.LSTM as one unbatched sequence of shape
# (seq_len, input_size) (needs a PyTorch version with unbatched RNN input);
# here that is a single step with a 10-dimensional vector.
input_data = torch.randn([1, 10])
# One class label per output row: a [5, 10] input would need 5 labels,
# e.g. torch.tensor([0, 1, 1, 1, 0]).
target = torch.tensor([1], dtype=torch.long)

# Forward pass and back-propagation.
result = model(input_data)
loss_fc = CrossEntropyLoss()
# The loss must be computed on the model output, not on the raw input.
loss = loss_fc(result, target)
loss.backward()

# Hand only the trainable parameters (requires_grad=True) to the optimizer.
optimizer = torch.optim.SGD(
    filter(lambda p: p.requires_grad, model.parameters()), lr=0.01
)
optimizer.step()
注释:上述代码片在优化函数部分对参数进行过滤,只选取requires_grad = True的参数进行优化更新。
为不同的层赋予不同的学习率
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss


class model(nn.Module):
    """One LSTM followed by a linear head, both trainable."""

    def __init__(self):
        super().__init__()
        # Parameters are trainable by default; layer constructors accept no
        # ``requires_grad`` argument.
        self.lstm = nn.LSTM(input_size=10, hidden_size=10)
        # ``in_features`` must equal the LSTM hidden_size (10).
        self.linear = nn.Linear(in_features=10, out_features=2)

    def forward(self, inputdata):
        """Map ``inputdata`` (last dimension 10) to 2 output scores per step."""
        # nn.LSTM returns ``(output, (h_n, c_n))``; keep only ``output``.
        lstm_result, _ = self.lstm(inputdata)
        return self.linear(lstm_result)


# NOTE: this rebinds ``model`` from the class to its instance; the class is
# not needed again below.
model = model()

# Build a fake input and ground-truth label.
# A 2-D tensor is treated by nn.LSTM as one unbatched sequence of shape
# (seq_len, input_size) (needs a PyTorch version with unbatched RNN input);
# here that is a single step with a 10-dimensional vector.
input_data = torch.randn([1, 10])
# One class label per output row: a [5, 10] input would need 5 labels,
# e.g. torch.tensor([0, 1, 1, 1, 0]).
target = torch.tensor([1], dtype=torch.long)

# Forward pass and back-propagation.
result = model(input_data)
loss_fc = CrossEntropyLoss()
# The loss must be computed on the model output, not on the raw input.
loss = loss_fc(result, target)
loss.backward()

# Give each layer its own learning rate through per-parameter-group options.
param_lstm = list(model.lstm.parameters())
param_linear = list(model.linear.parameters())
params = [
    {'params': param_lstm, 'lr': 0.1},
    {'params': param_linear, 'lr': 0.01},
]
optimizer = torch.optim.SGD(params)
optimizer.step()
将两个模型参数的平均值赋予第三个模型
import torch
import torch.nn as nn
from collections import OrderedDict
# Create two models with the same architecture.
model1 = nn.Linear(10, 10)
model2 = nn.Linear(10, 10)

# Average the two models entry by entry.
# ``state_dict`` is a method: it must be called to obtain the OrderedDict
# that maps parameter names to tensors.
state1 = model1.state_dict()
state2 = model2.state_dict()
param_dict = OrderedDict()
for key in state1:
    param_dict[key] = (state1[key] + state2[key]) / 2

# Load the averaged parameters into a third model; all three models must be
# constructed identically so the keys and shapes match.
model3 = nn.Linear(10, 10)
model3.load_state_dict(param_dict)
输出模型中每个层的梯度
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss


class model(nn.Module):
    """One LSTM followed by a linear head, both trainable."""

    def __init__(self):
        super().__init__()
        # Parameters are trainable by default; layer constructors accept no
        # ``requires_grad`` argument.
        self.lstm = nn.LSTM(input_size=10, hidden_size=10)
        # ``in_features`` must equal the LSTM hidden_size (10).
        self.linear = nn.Linear(in_features=10, out_features=2)

    def forward(self, inputdata):
        """Map ``inputdata`` (last dimension 10) to 2 output scores per step."""
        # nn.LSTM returns ``(output, (h_n, c_n))``; keep only ``output``.
        lstm_result, _ = self.lstm(inputdata)
        return self.linear(lstm_result)


# NOTE: this rebinds ``model`` from the class to its instance; the class is
# not needed again below.
model = model()

# Build a fake input and ground-truth label.
# A 2-D tensor is treated by nn.LSTM as one unbatched sequence of shape
# (seq_len, input_size) (needs a PyTorch version with unbatched RNN input);
# here that is a single step with a 10-dimensional vector.
input_data = torch.randn([1, 10])
# One class label per output row: a [5, 10] input would need 5 labels,
# e.g. torch.tensor([0, 1, 1, 1, 0]).
target = torch.tensor([1], dtype=torch.long)

# Forward pass and back-propagation.
result = model(input_data)
loss_fc = CrossEntropyLoss()
# The loss must be computed on the model output, not on the raw input.
loss = loss_fc(result, target)
loss.backward()

# Print the gradients layer by layer. Modules have no ``.grad`` attribute;
# gradients are stored on each parameter tensor.
for name, param in model.lstm.named_parameters():
    print(name, param.grad)
for name, param in model.linear.named_parameters():
    print(name, param.grad)

# Print weight and bias gradients individually. A single-layer nn.LSTM names
# its parameters weight_ih_l0 / weight_hh_l0 / bias_ih_l0 / bias_hh_l0.
print(model.lstm.weight_ih_l0.grad)
print(model.lstm.weight_hh_l0.grad)
print(model.lstm.bias_ih_l0.grad)
print(model.lstm.bias_hh_l0.grad)
print(model.linear.weight.grad)
print(model.linear.bias.grad)
查看模型梯度参考:https://zhuanlan.zhihu.com/p/36121066
后续还需了解如何直接为某层赋予一定的梯度。