LSTM Code Walkthrough
Data - IMDB
In [ ]:
# Tip: when rerunning the notebook, try to rerun whole sections at a time,
# e.g. rerun everything from MyLSTM onward in full.
# Rerunning a single cell in isolation may produce unexpected results.
In [1]:
# Load the IMDB dataset via torchtext
In [2]:
# dataset imports
# !pip install torchtext torchdata
from torchtext.datasets import IMDB
from torchtext.datasets.imdb import NUM_LINES
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
import torch
import torch.nn as nn
from torch import utils
import torch.nn.functional as F

# logging and utilities
import numpy as np
from tqdm import tqdm
import os
import sys
import logging
logging.basicConfig(
    level=logging.WARN, stream=sys.stdout,
    format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")

# device: falls back to cpu when no GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
In [3]:
def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(1998)
In [4]:
train_data_iter = IMDB(root="./data", split="train")
In [5]:
# Tokenize each input comment; this generator yields one token list per sample
def yeild_tokens(train_data_iter, tokenizer):
    for i, sample in enumerate(train_data_iter):
        label, comment = sample
        # If you enable the commented-out inspection cell below, remember to switch yield/return here accordingly
        yield tokenizer(comment)
        # return tokenizer(comment)
In [6]:
# Uncomment the lines below to see what yeild_tokens produces
# x = yeild_tokens(train_data_iter, tokenizer)
# x
In [7]:
# Tokenize and build the vocabulary; the first run may take a while
tokenizer = get_tokenizer("basic_english")
# Only keep tokens that appear at least 20 times
vocab = build_vocab_from_iterator(yeild_tokens(train_data_iter, tokenizer), min_freq=20, specials=["<unk>"])
vocab.set_default_index(0)  # unknown tokens map to index 0
print(f'Vocabulary size: {len(vocab)}')
Vocabulary size: 13351
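To see the vocabulary in action, a quick lookup (a small sketch; the example sentence is arbitrary):

print(vocab(tokenizer("this movie was great")))  # list of token indices
print(vocab(["sometotallyunknowntoken"]))        # out-of-vocabulary tokens fall back to index 0 (<unk>)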
In [8]:
# Build the word-embedding table -- random initialization here; word2vec or GloVe vectors could be used instead
embedding = nn.Embedding(len(vocab), 64)
# a = torch.LongTensor([0])
# a
# embedding(a)
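A quick shape check (a sketch; the indices are arbitrary): a (batch, seq_len) tensor of token indices comes back as (batch, seq_len, 64) vectors, which is exactly the shape collate_fn hands to the models below.

sample_indices = torch.LongTensor([[0, 5, 7]])  # shape (1, 3)
print(embedding(sample_indices).shape)          # torch.Size([1, 3, 64])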
In [9]:
# Since the LSTM is trained in batches and sentences within a batch have
# different lengths, each batch is padded to its longest sentence.
# Labels are also remapped from 1/2 to 0/1.
def collate_fn(batch):
    """
    Post-process the mini-batch produced by the DataLoader
    """
    # print(batch)
    target = []
    token_index = []
    max_length = 0  # longest token sequence in this batch
    for i, (label, comment) in enumerate(batch):
        tokens = tokenizer(comment)
        # print(tokens)
        # print(vocab(tokens))
        token_index.append(vocab(tokens))  # convert the token list to an index list
        # track the longest sentence
        if len(tokens) > max_length:
            max_length = len(tokens)
        # target labels: 1 becomes 0, 2 becomes 1
        if label == 1:
            target.append(0)
        else:
            target.append(1)
    # print(token_index)
    # pad every sequence to the longest length
    token_index = [index + [0] * (max_length - len(index)) for index in token_index]
    # look up the word vectors
    token_index = embedding(torch.tensor(token_index).to(torch.int32))
    # print(token_index.shape)
    # nll_loss expects long-integer targets, so convert to int64
    return (torch.tensor(target).to(torch.int64), token_index)
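As a quick sanity check of collate_fn (a minimal sketch; the two hand-written samples are arbitrary but follow the (label, comment) format that IMDB yields), the shorter comment gets zero-padded to the length of the longer one:

demo_batch = [(1, "this movie was great"),
              (2, "a terrible , boring film that dragged on forever")]
demo_labels, demo_vectors = collate_fn(demo_batch)
print(demo_labels)         # tensor([0, 1]) -- labels remapped from 1/2 to 0/1
print(demo_vectors.shape)  # (2, padded_seq_len, 64)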
In [10]:
# To understand the code above, uncomment the lines below; with batch_size >= 2,
# note how shorter samples are padded to the longest length in the batch.
# (train_data_loader is defined two cells below.)
# for batch_index, (target, token_index) in enumerate(train_data_loader):
#     print(batch_index)
#     print(target)
#     print(token_index)
#     # (batch_size, seq_len, input_size)
#     print(token_index.shape)
#     break
In [11]:
# Hyperparameters
input_size = 64      # must match the embedding dimension above
hidden_size = 128
num_layers = 1
num_classes = 2
batch_size = 32
max_seq_len = 512
learning_rate = 0.01
In [12]:
# Train DataLoader
train_data_iter = IMDB(root="data", split="train")
train_data_loader = torch.utils.data.DataLoader(
    to_map_style_dataset(train_data_iter), batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

# Eval DataLoader
eval_data_iter = IMDB(root="data", split="test")
eval_data_loader = utils.data.DataLoader(
    to_map_style_dataset(eval_data_iter), batch_size=batch_size, collate_fn=collate_fn)

# to_map_style_dataset wraps the iterable-style dataset as a map-style dataset so the DataLoader can index and shuffle it
Model
Gate
In [4]:
# A small hidden size for the toy gate demo below
hidden_size_temp = 3
In [5]:
sigmoid = nn.Sigmoid()
sigmoid
Out[5]:
Sigmoid()
In [17]:
hid = torch.ones(hidden_size_temp, hidden_size_temp)
hid
Out[17]:
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
In [18]:
model = nn.Linear(hidden_size_temp, hidden_size_temp)
model
Out[18]:
Linear(in_features=3, out_features=3, bias=True)
In [21]:
# x here is a small (2, 3) input tensor defined in a cell not shown on this page;
# any tensor of shape (2, hidden_size_temp), e.g. torch.ones(2, hidden_size_temp), has the right shape
mid_output = model(x)
mid_output
Out[21]:
tensor([[-0.2625, 0.3412, -0.5055],
        [-0.2625, 0.3412, -0.5055]], grad_fn=<AddmmBackward0>)
In [22]:
gate = sigmoid(mid_output)
gate
Out[22]:
tensor([[0.4347, 0.5845, 0.3762],
        [0.4347, 0.5845, 0.3762]], grad_fn=<SigmoidBackward0>)
In [23]:
final_output = gate * mid_output
final_output
Out[23]:
tensor([[-0.1141, 0.1994, -0.1902],
        [-0.1141, 0.1994, -0.1902]], grad_fn=<MulBackward0>)
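That is the entire gating idea: a linear projection squashed into (0, 1) by a sigmoid acts as an element-wise valve on some value of the same shape. A minimal sketch of the pattern (the helper name is hypothetical, not part of the notebook):

def apply_gate(gate_layer, x, value):
    gate = torch.sigmoid(gate_layer(x))  # entries in (0, 1): 0 blocks, 1 passes
    return gate * value                  # element-wise scaling of the value

# e.g. apply_gate(model, hid, model(hid)) runs the same three steps as the cells above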
In [ ]:
# Redefine the hyperparameters so the toy cells above do not interfere with the training below
input_size = 64      # must match the embedding dimension
hidden_size = 128
num_layers = 1
num_classes = 2
batch_size = 32
max_seq_len = 512
learning_rate = 0.01
MyLSTM
In [13]:
# Define the basic model
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        """
        args:
            input_size: size of each input vector
            hidden_size: size of the hidden state
            num_layers: number of LSTM layers
            num_classes: number of output classes; in this example the output is 0 or 1
        """
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # one linear layer per gate: input, forget, cell candidate, output
        self.fc_i = nn.Linear(input_size + hidden_size, hidden_size)
        self.fc_f = nn.Linear(input_size + hidden_size, hidden_size)
        self.fc_g = nn.Linear(input_size + hidden_size, hidden_size)
        self.fc_o = nn.Linear(input_size + hidden_size, hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.fc_out = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x shape: (batch_size, seq_len, input_size)
        # print(x.shape)
        h_t = torch.zeros(x.size(0), x.size(1), self.hidden_size).to(x.device)
        c_t = torch.zeros(x.size(0), x.size(1), self.hidden_size).to(x.device)
        # print(h_t.shape)
        # print(c_t.shape)
        # Note: this simplified version applies the gate computations once to the
        # whole padded sequence with zero initial states, i.e. it does not recur
        # over time steps; it is meant to illustrate the gate equations.
        combined = torch.cat((x, h_t), dim=2)
        i_t = self.sigmoid(self.fc_i(combined))
        f_t = self.sigmoid(self.fc_f(combined))
        g_t = self.tanh(self.fc_g(combined))
        o_t = self.sigmoid(self.fc_o(combined))
        c_t = f_t * c_t + i_t * g_t
        h_t = o_t * self.tanh(c_t)
        # print(x.shape)
        # print(combined.shape)
        # print(i_t.shape)
        # print(f_t.shape)
        # print(g_t.shape)
        # print(o_t.shape)
        # print(h_t.shape)
        # average-pool the hidden states over the sequence dimension
        h_t = F.avg_pool2d(h_t, (h_t.shape[1], 1)).squeeze()
        out = self.fc_out(h_t)
        # print(out.cpu().shape)
        return out
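For reference, here is a sketch of a step-by-step recurrent variant (the class name is hypothetical; this is not the model that produced the parameter count and training logs below). The class above applies the four gates once to the whole padded sequence with zero initial states, whereas a textbook LSTM feeds h and c from one time step into the next:

class StepwiseLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.fc_i = nn.Linear(input_size + hidden_size, hidden_size)  # input gate
        self.fc_f = nn.Linear(input_size + hidden_size, hidden_size)  # forget gate
        self.fc_g = nn.Linear(input_size + hidden_size, hidden_size)  # cell candidate
        self.fc_o = nn.Linear(input_size + hidden_size, hidden_size)  # output gate
        self.fc_out = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x: (batch_size, seq_len, input_size)
        batch_size, seq_len, _ = x.shape
        h_t = torch.zeros(batch_size, self.hidden_size, device=x.device)
        c_t = torch.zeros(batch_size, self.hidden_size, device=x.device)
        hidden_states = []
        for t in range(seq_len):
            combined = torch.cat((x[:, t, :], h_t), dim=1)
            i_t = torch.sigmoid(self.fc_i(combined))
            f_t = torch.sigmoid(self.fc_f(combined))
            g_t = torch.tanh(self.fc_g(combined))
            o_t = torch.sigmoid(self.fc_o(combined))
            c_t = f_t * c_t + i_t * g_t   # update the cell state
            h_t = o_t * torch.tanh(c_t)   # update the hidden state
            hidden_states.append(h_t)
        # mean-pool the hidden states over time, as in the class above
        pooled = torch.stack(hidden_states, dim=1).mean(dim=1)
        return self.fc_out(pooled)

StepwiseLSTM(input_size, hidden_size, num_classes) accepts the same (batch, seq_len, input_size) batches produced by collate_fn, but trains noticeably slower because of the Python loop over time steps.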
In [14]:
# Sanity-check the model
# Fix the random seed so the result is reproducible
torch.manual_seed(2023)

# Generate dummy data
x = torch.randn(batch_size, max_seq_len, input_size).to(device)
y = torch.randint(0, num_classes, (batch_size,)).to(device)

# Instantiate the model
model = LSTM(input_size, hidden_size, num_layers, num_classes)
model.to(device)

# Print the number of trainable parameters
print("Number of parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

# Run a forward pass
output = model(x)
# output
Number of parameters: 99074
In [15]:
# Inspect the model structure
model
Out[15]:
LSTM(
  (fc_i): Linear(in_features=192, out_features=128, bias=True)
  (fc_f): Linear(in_features=192, out_features=128, bias=True)
  (fc_g): Linear(in_features=192, out_features=128, bias=True)
  (fc_o): Linear(in_features=192, out_features=128, bias=True)
  (sigmoid): Sigmoid()
  (tanh): Tanh()
  (fc_out): Linear(in_features=128, out_features=2, bias=True)
)
In [16]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
cur_epoch = -1
for epoch_index in range(0, 6):
    num_batches = len(train_data_loader)
    for batch_index, (target, token_index) in tqdm(enumerate(train_data_loader)):
        model.train()
        optimizer.zero_grad()
        target = target.to(device)
        token_index = token_index.to(device)
        step = num_batches * epoch_index + batch_index + 1
        logits = model(token_index)
        loss = F.nll_loss(F.log_softmax(logits, dim=-1), target)
        loss.backward()
        optimizer.step()
        if batch_index % 300 == 0:
            cur_epoch = epoch_index
            logging.warning(f"epoch_index: {epoch_index}, batch_index: {batch_index}, loss: {loss}")
0it [00:00, ?it/s]
2023-03-17 00:37:03,079 (233408485:20) WARNING: epoch_index: 0, batch_index: 0, loss: 0.6886175274848938
298it [00:07, 38.08it/s]
2023-03-17 00:37:10,820 (233408485:20) WARNING: epoch_index: 0, batch_index: 300, loss: 0.6428654193878174
599it [00:15, 39.36it/s]
2023-03-17 00:37:18,390 (233408485:20) WARNING: epoch_index: 0, batch_index: 600, loss: 0.48170384764671326
782it [00:20, 39.09it/s] 0it [00:00, ?it/s]
2023-03-17 00:37:23,065 (233408485:20) WARNING: epoch_index: 1, batch_index: 0, loss: 0.44872817397117615
300it [00:07, 37.30it/s]
2023-03-17 00:37:30,612 (233408485:20) WARNING: epoch_index: 1, batch_index: 300, loss: 0.5459613800048828
600it [00:15, 34.74it/s]
2023-03-17 00:37:38,373 (233408485:20) WARNING: epoch_index: 1, batch_index: 600, loss: 0.41060134768486023
782it [00:20, 39.04it/s] 0it [00:00, ?it/s]
2023-03-17 00:37:43,096 (233408485:20) WARNING: epoch_index: 2, batch_index: 0, loss: 0.25612199306488037
297it [00:07, 38.88it/s]
2023-03-17 00:37:51,098 (233408485:20) WARNING: epoch_index: 2, batch_index: 300, loss: 0.33394932746887207
596it [00:15, 41.05it/s]
2023-03-17 00:37:58,503 (233408485:20) WARNING: epoch_index: 2, batch_index: 600, loss: 0.4152831733226776
782it [00:19, 39.31it/s] 0it [00:00, ?it/s]
2023-03-17 00:38:02,998 (233408485:20) WARNING: epoch_index: 3, batch_index: 0, loss: 0.31165584921836853
296it [00:07, 41.12it/s]
2023-03-17 00:38:10,474 (233408485:20) WARNING: epoch_index: 3, batch_index: 300, loss: 0.348602294921875
600it [00:15, 35.43it/s]
2023-03-17 00:38:18,298 (233408485:20) WARNING: epoch_index: 3, batch_index: 600, loss: 0.22047576308250427
782it [00:20, 38.53it/s] 0it [00:00, ?it/s]
2023-03-17 00:38:23,300 (233408485:20) WARNING: epoch_index: 4, batch_index: 0, loss: 0.3187304735183716
300it [00:07, 39.59it/s]
2023-03-17 00:38:31,194 (233408485:20) WARNING: epoch_index: 4, batch_index: 300, loss: 0.17088855803012848
600it [00:15, 36.99it/s]
2023-03-17 00:38:39,150 (233408485:20) WARNING: epoch_index: 4, batch_index: 600, loss: 0.4386771321296692
782it [00:20, 38.19it/s] 0it [00:00, ?it/s]
2023-03-17 00:38:43,787 (233408485:20) WARNING: epoch_index: 5, batch_index: 0, loss: 0.2063377946615219
300it [00:07, 39.54it/s]
2023-03-17 00:38:51,578 (233408485:20) WARNING: epoch_index: 5, batch_index: 300, loss: 0.3431491255760193
600it [00:15, 41.39it/s]
2023-03-17 00:38:59,081 (233408485:20) WARNING: epoch_index: 5, batch_index: 600, loss: 0.2605396807193756
782it [00:19, 39.37it/s]
In [17]:
model.eval()
total_acc_account = 0
total_account = 0
for eval_batch_index, (eval_target, eval_token_index) in tqdm(enumerate(eval_data_loader)):
    eval_target = eval_target.to(device)
    eval_token_index = eval_token_index.to(device)
    total_account += eval_target.shape[0]
    eval_logits = model(eval_token_index)
    total_acc_account += (torch.argmax(eval_logits, dim=-1) == eval_target).sum().item()
    eval_loss = F.nll_loss(F.log_softmax(eval_logits, dim=-1), eval_target)
logging.warning(f"eval_loss: {eval_loss}, eval_acc: {total_acc_account / total_account}")
782it [00:10, 71.24it/s]
2023-03-17 00:39:14,595 (612167170:11) WARNING: eval_loss: 0.6957253813743591, eval_acc: 0.86988
LSTM-PyTorch
In [18]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        """
        args:
            input_size: size of each input vector
            hidden_size: size of the hidden state
            num_layers: number of LSTM layers
            num_classes: number of output classes; in this example the output is 0 or 1
        """
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(self.input_size, self.hidden_size, num_layers=num_layers,
                            batch_first=True, bidirectional=False)
        self.fc_out = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x shape: (batch_size, seq_len, input_size)
        x, _ = self.lstm(x)
        # average-pool the per-timestep hidden states, then classify
        x = F.avg_pool2d(x, (x.shape[1], 1)).squeeze()
        out = self.fc_out(x)
        return out
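A common alternative to average pooling over time is to classify from the final hidden state that nn.LSTM returns (a sketch of a replacement forward under the same hyperparameters; not the variant that produced the logs below):

def forward(self, x):
    output, (h_n, c_n) = self.lstm(x)  # h_n shape: (num_layers, batch_size, hidden_size)
    return self.fc_out(h_n[-1])        # final hidden state of the last layer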
In [19]:
# Sanity-check the model
# Generate dummy data
x = torch.randn(batch_size, max_seq_len, input_size).to(device)
y = torch.randint(0, num_classes, (batch_size,)).to(device)

# Instantiate the model
model = LSTM(input_size, hidden_size, num_layers, num_classes)
model.to(device)

# Print the number of trainable parameters
print("Number of parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

# Run a forward pass
output = model(x)
# output
Number of parameters: 99586
In [20]:
# Inspect the model structure
model
Out[20]:
LSTM(
  (lstm): LSTM(64, 128, batch_first=True)
  (fc_out): Linear(in_features=128, out_features=2, bias=True)
)
In [21]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
cur_epoch = -1
for epoch_index in range(0, 3):
    num_batches = len(train_data_loader)
    for batch_index, (target, token_index) in tqdm(enumerate(train_data_loader)):
        model.train()
        optimizer.zero_grad()
        target = target.to(device)
        token_index = token_index.to(device)
        step = num_batches * epoch_index + batch_index + 1
        logits = model(token_index)
        loss = F.nll_loss(F.log_softmax(logits, dim=-1), target)
        loss.backward()
        # nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # clip the gradient norm to keep training stable
        optimizer.step()
        if batch_index % 300 == 0:
            cur_epoch = epoch_index
            logging.warning(f"epoch_index: {epoch_index}, batch_index: {batch_index}, loss: {loss}")
0it [00:00, ?it/s]
2023-03-17 00:39:14,813 (3862857436:23) WARNING: epoch_index: 0, batch_index: 0, loss: 0.6966263055801392
299it [00:08, 38.52it/s]
2023-03-17 00:39:23,082 (3862857436:23) WARNING: epoch_index: 0, batch_index: 300, loss: 0.5508784055709839
599it [00:16, 36.47it/s]
2023-03-17 00:39:31,275 (3862857436:23) WARNING: epoch_index: 0, batch_index: 600, loss: 0.5803604125976562
782it [00:21, 36.57it/s] 0it [00:00, ?it/s]
2023-03-17 00:39:36,188 (3862857436:23) WARNING: epoch_index: 1, batch_index: 0, loss: 0.40404248237609863
298it [00:08, 34.45it/s]
2023-03-17 00:39:44,230 (3862857436:23) WARNING: epoch_index: 1, batch_index: 300, loss: 0.28521886467933655
600it [00:16, 37.37it/s]
2023-03-17 00:39:52,248 (3862857436:23) WARNING: epoch_index: 1, batch_index: 600, loss: 0.2420959770679474
782it [00:20, 37.49it/s] 0it [00:00, ?it/s]
2023-03-17 00:39:57,044 (3862857436:23) WARNING: epoch_index: 2, batch_index: 0, loss: 0.40587106347084045
296it [00:08, 35.26it/s]
2023-03-17 00:40:05,171 (3862857436:23) WARNING: epoch_index: 2, batch_index: 300, loss: 0.4198710024356842
599it [00:16, 37.80it/s]
2023-03-17 00:40:13,358 (3862857436:23) WARNING: epoch_index: 2, batch_index: 600, loss: 0.33481597900390625
782it [00:21, 36.72it/s]
In [22]:
model.eval()
total_acc_account = 0
total_account = 0
for eval_batch_index, (eval_target, eval_token_index) in tqdm(enumerate(eval_data_loader)):
    eval_target = eval_target.to(device)
    eval_token_index = eval_token_index.to(device)
    total_account += eval_target.shape[0]
    eval_logits = model(eval_token_index)
    total_acc_account += (torch.argmax(eval_logits, dim=-1) == eval_target).sum().item()
    eval_loss = F.nll_loss(F.log_softmax(eval_logits, dim=-1), eval_target)
logging.warning(f"eval_loss: {eval_loss}, eval_acc: {total_acc_account / total_account}")
782it [00:10, 74.04it/s]
2023-03-17 00:40:28,881 (612167170:11) WARNING: eval_loss: 0.6789069175720215, eval_acc: 0.8048
RNN
In [33]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(RNN, self).__init__()
        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Defining the layers
        # RNN layer
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        # Initialize the hidden state using the method defined below
        hidden = self.init_hidden(batch_size)
        # Pass the input and hidden state into the model and obtain the outputs
        out, hidden = self.rnn(x, hidden)
        # Pool the per-timestep outputs so they fit the fully connected layer
        # out = out.contiguous().view(-1, self.hidden_dim)
        out = F.avg_pool2d(out, (out.shape[1], 1)).squeeze()
        out = self.fc(out)
        return out

    def init_hidden(self, batch_size):
        # Generate the initial hidden state of zeros used in the forward pass,
        # placed on the device specified earlier
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
        return hidden
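A plain RNN is more prone to exploding or vanishing gradients than the LSTM above, which is why the training cell below lowers the learning rate and clips the gradient norm. A minimal illustration of what clip_grad_norm_ does (the toy model here is hypothetical, unrelated to the RNN above):

toy = nn.Linear(4, 1)
toy_loss = toy(torch.randn(8, 4)).pow(2).mean()
toy_loss.backward()
# Returns the total gradient norm before clipping; afterwards the gradients
# are rescaled in place so their overall norm is at most max_norm
total_norm = nn.utils.clip_grad_norm_(toy.parameters(), max_norm=0.1)
print(total_norm)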
In [34]:
# Generate dummy data
x = torch.randn(batch_size, max_seq_len, input_size).to(device)
y = torch.randint(0, num_classes, (batch_size,)).to(device)
hidden = torch.zeros(batch_size, hidden_size).to(device)

# Instantiate the model
model = RNN(input_size, num_classes, hidden_size, 1)
model.to(device)

# Print the number of trainable parameters
print("Number of parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

# Run a forward pass
output = model(x)
# output
Number of parameters: 25090
In [35]:
model
Out[35]:
RNN(
  (rnn): RNN(64, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)
In [38]:
# With lr = 0.01 training tends to diverge
# Trained for 18 epochs in total at lr = 0.001; this rerun covers the final epochs at lr = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
cur_epoch = -1
for epoch_index in range(12, 18):
    num_batches = len(train_data_loader)
    for batch_index, (target, token_index) in tqdm(enumerate(train_data_loader)):
        model.train()
        optimizer.zero_grad()
        target = target.to(device)
        token_index = token_index.to(device)
        step = num_batches * epoch_index + batch_index + 1
        output = model(token_index)
        loss = F.nll_loss(F.log_softmax(output, dim=-1), target)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # clip the gradient norm to keep training stable
        optimizer.step()
        if batch_index % 300 == 0:
            cur_epoch = epoch_index
            logging.warning(f"epoch_index: {epoch_index}, batch_index: {batch_index}, loss: {loss}")
0it [00:00, ?it/s]
2023-03-17 00:59:03,908 (1096697844:23) WARNING: epoch_index: 12, batch_index: 0, loss: 0.6250293254852295
300it [00:06, 50.94it/s]
2023-03-17 00:59:10,319 (1096697844:23) WARNING: epoch_index: 12, batch_index: 300, loss: 0.6724219918251038
599it [00:12, 42.78it/s]
2023-03-17 00:59:16,588 (1096697844:23) WARNING: epoch_index: 12, batch_index: 600, loss: 0.5623435378074646
782it [00:16, 47.56it/s] 0it [00:00, ?it/s]
2023-03-17 00:59:20,355 (1096697844:23) WARNING: epoch_index: 13, batch_index: 0, loss: 0.5873069167137146
295it [00:06, 49.88it/s]
2023-03-17 00:59:26,896 (1096697844:23) WARNING: epoch_index: 13, batch_index: 300, loss: 0.5667327642440796
597it [00:12, 46.18it/s]
2023-03-17 00:59:33,395 (1096697844:23) WARNING: epoch_index: 13, batch_index: 600, loss: 0.630550742149353
782it [00:16, 47.39it/s] 0it [00:00, ?it/s]
2023-03-17 00:59:36,858 (1096697844:23) WARNING: epoch_index: 14, batch_index: 0, loss: 0.5228468179702759
300it [00:06, 43.12it/s]
2023-03-17 00:59:42,946 (1096697844:23) WARNING: epoch_index: 14, batch_index: 300, loss: 0.46919119358062744
598it [00:12, 44.54it/s]
2023-03-17 00:59:49,330 (1096697844:23) WARNING: epoch_index: 14, batch_index: 600, loss: 0.6358293294906616
782it [00:16, 47.77it/s] 0it [00:00, ?it/s]
2023-03-17 00:59:53,238 (1096697844:23) WARNING: epoch_index: 15, batch_index: 0, loss: 0.6251819729804993
297it [00:06, 52.90it/s]
2023-03-17 00:59:59,394 (1096697844:23) WARNING: epoch_index: 15, batch_index: 300, loss: 0.5249137282371521
597it [00:11, 49.78it/s]
2023-03-17 01:00:05,039 (1096697844:23) WARNING: epoch_index: 15, batch_index: 600, loss: 0.6910139918327332
782it [00:15, 51.67it/s] 0it [00:00, ?it/s]
2023-03-17 01:00:08,375 (1096697844:23) WARNING: epoch_index: 16, batch_index: 0, loss: 0.5868373513221741
299it [00:05, 52.95it/s]
2023-03-17 01:00:14,215 (1096697844:23) WARNING: epoch_index: 16, batch_index: 300, loss: 0.6238973736763
595it [00:11, 46.68it/s]
2023-03-17 01:00:20,129 (1096697844:23) WARNING: epoch_index: 16, batch_index: 600, loss: 0.48795947432518005
782it [00:15, 49.65it/s] 0it [00:00, ?it/s]
2023-03-17 01:00:24,120 (1096697844:23) WARNING: epoch_index: 17, batch_index: 0, loss: 0.5158190131187439
296it [00:06, 48.67it/s]
2023-03-17 01:00:30,500 (1096697844:23) WARNING: epoch_index: 17, batch_index: 300, loss: 0.7333455681800842
596it [00:11, 56.56it/s]
2023-03-17 01:00:36,137 (1096697844:23) WARNING: epoch_index: 17, batch_index: 600, loss: 0.5788255333900452
782it [00:15, 50.98it/s]
In [39]:
model.eval()
total_acc_account = 0
total_account = 0
for eval_batch_index, (eval_target, eval_token_index) in tqdm(enumerate(eval_data_loader)):
    eval_target = eval_target.to(device)
    eval_token_index = eval_token_index.to(device)
    total_account += eval_target.shape[0]
    eval_logits = model(eval_token_index)
    total_acc_account += (torch.argmax(eval_logits, dim=-1) == eval_target).sum().item()
    eval_loss = F.nll_loss(F.log_softmax(eval_logits, dim=-1), eval_target)
logging.warning(f"eval_loss: {eval_loss}, eval_acc: {total_acc_account / total_account}")
782it [00:08, 92.90it/s]
2023-03-17 01:01:44,283 (612167170:11) WARNING: eval_loss: 1.3197190761566162, eval_acc: 0.67968
In [ ]:
# Judging by the logs above, the model is still undertrained; there is room for improvement
Last updated: November 30, 2023
Created: November 30, 2023