https://github.com/FudanNLP/nlp-beginner
参考文章:ESIM论文,ESIM代码实现参考
实现效果很差,暂时仅当是个记录。找到问题了,完全跑完估计要花个四五天,总之先挂着等数据。
1. 代码
模型部分基本上是按着参考代码打的,只是加了点size的注释,去掉了mask机制,其他数据处理和训练等部分和task2大同小异
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split
import pandas as pd
import numpy as np
import random
# Map each SNLI label string to the integer class id used as the training target.
label_to_index = {
    'contradiction': 0,
    'neutral': 1,
    'entailment': 2,
}
# Inverse lookup: class id -> label string.
index_to_label = {index: label for label, index in label_to_index.items()}
# Load the SNLI training split and turn every usable row into
# [premise_tokens, hypothesis_tokens, label_id].
read_data = pd.read_table('../snli_1.0_train.txt')
data = []
data_len = int(read_data.shape[0])  # raw row count (the original note says 550152)

# The binary-parse columns wrap tokens in parentheses; replacing both bracket
# characters with spaces and splitting on whitespace leaves just the tokens.
_paren_to_space = str.maketrans('()', '  ')

# Iterate the three columns in lockstep instead of doing a per-row
# read_data[col][i] lookup, which is very slow over the whole file.
for _parse1, _parse2, _label in zip(read_data['sentence1_binary_parse'],
                                    read_data['sentence2_binary_parse'],
                                    read_data['label1']):
    # Rows whose parse is missing (read back as NaN, e.g. an "N/A" entry)
    # cannot be tokenised, so they are skipped rather than patched up.
    if pd.isnull(_parse1) or pd.isnull(_parse2):
        continue
    data.append([_parse1.lower().translate(_paren_to_space).split(),
                 _parse2.lower().translate(_paren_to_space).split(),
                 label_to_index[_label]])
# Vocabulary tables. Ids are handed out in first-seen order: for each example
# the premise tokens are visited first, then the hypothesis tokens.
word_to_ix = {}   # token -> integer id
ix_to_word = {}   # integer id -> token
word_set = set()  # every token that received an id
for premise_tokens, hypothesis_tokens, _ in data:
    for tokens in (premise_tokens, hypothesis_tokens):
        for token in tokens:
            if token in word_to_ix:
                continue
            next_id = len(word_to_ix)
            ix_to_word[next_id] = token
            word_to_ix[token] = next_id
            word_set.add(token)

# Register a placeholder token for unknown words at the next free id.
unk = '<unk>'
unk_id = len(word_to_ix)
ix_to_word[unk_id] = unk
word_to_ix[unk] = unk_id
word_set.add(unk)
# Seed torch: this affects both random_split below and the later parameter
# initialisation.
torch.manual_seed(6)

# Split sizes must be derived from len(data), not from the raw row count:
# rows with a missing parse were skipped while building `data`, and
# random_split requires the lengths to sum to the dataset length.
train_len = int(0.8 * len(data))
test_len = len(data) - train_len
train_data, test_data = random_split(data, [train_len, test_len])

# random_split returns torch.utils.data.Subset objects; materialise them as
# plain lists so they can be shuffled and sliced later.
train_data = list(train_data)
test_data = list(test_data)
# Hyper-parameter dictionary, kept in one place to make tuning easy.
args = {
    'vocab_size': len(word_to_ix),        # number of tokens; sizes the embedding table
    'embedding_size': 50,                 # dimensionality of each word vector
    'hidden_size': 16,
    'type_num': len(label_to_index),      # number of output classes (one per label)
    'train_batch_size': 1000,             # int(train_len / 10),
    'dropout': 0.1,
}
# Read the GloVe vectors, keeping only words that occur in our vocabulary.
glove_word2vec = {}
with open('../glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines instead of failing on fields[0].
            continue
        word = fields[0]
        if word in word_set:
            glove_word2vec[word] = [float(v) for v in fields[1:]]

# Build the embedding matrix row by row in vocabulary-id order. Words without
# a GloVe vector get a random normal vector and are counted in unk_num.
pretrained_vec = []
unk_num = 0
for i in range(args['vocab_size']):
    vector = glove_word2vec.get(ix_to_word[i])
    if vector is None:
        # .tolist() yields plain floats, so every row has the same element type.
        vector = torch.randn(args['embedding_size']).tolist()
        unk_num += 1
    pretrained_vec.append(vector)
print(unk_num, args['vocab_size'])
pretrained_vec = np.array(pretrained_vec, dtype=np.float32)

# Round train_len down to a whole number of batches.
train_len = (train_len // args['train_batch_size']) * args['train_batch_size']
class Embedding_Layer(nn.Module):
    """Embedding lookup whose weights start from the module-level ``pretrained_vec``."""

    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # Overwrite the random initialisation with the pretrained matrix.
        initial_weights = torch.from_numpy(pretrained_vec)
        self.embedding.weight.data.copy_(initial_weights)

    def forward(self, x):
        """Return the embedding vectors for the token ids in ``x``."""
        embedded = self.embedding(x)
        return embedded
class Encoding_Layer(nn.Module):
    """Bidirectional, batch-first LSTM encoder over embedded tokens."""

    def __init__(self, embedding_size, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x):
        """Return the per-token LSTM outputs: [batch_size, length, hidden_size * 2]."""
        # The second return value is the final (h_n, c_n) state, unused here.
        outputs, _final_state = self.lstm(x)
        return outputs
class LocalInference_Layer(nn.Module):
    """Soft-align premise and hypothesis and build the enhanced local features."""

    def __init__(self):
        super().__init__()
        self.softmax1 = nn.Softmax(dim=1)  # normalises over premise positions
        self.softmax2 = nn.Softmax(dim=2)  # normalises over hypothesis positions

    def forward(self, p, h):
        """Return ``(m_p, m_h)`` for encoded premise ``p`` and hypothesis ``h``.

        Each output concatenates the encoding, its aligned counterpart, their
        difference and their element-wise product along the feature axis.
        """
        # Similarity of every premise token with every hypothesis token:
        # [batch_size, length1, length2]
        scores = p @ h.transpose(1, 2)

        # Attention weights used for the hypothesis side (paper eq. 13) and
        # the premise side (paper eq. 12).
        weights_for_h = self.softmax1(scores)
        weights_for_p = self.softmax2(scores)

        # [batch_size, length1, hidden_size * 2]
        aligned_p = torch.bmm(weights_for_p, h)
        # [batch_size, length2, hidden_size * 2]
        aligned_h = torch.bmm(weights_for_h.transpose(1, 2), p)

        # [batch_size, length1, hidden_size * 8]
        m_p = torch.cat([p, aligned_p, p - aligned_p, p * aligned_p], dim=2)
        # [batch_size, length2, hidden_size * 8]
        m_h = torch.cat([h, aligned_h, h - aligned_h, h * aligned_h], dim=2)
        return m_p, m_h
class Composition_Layer(nn.Module):
    """Linear projection, dropout, then a bidirectional batch-first LSTM."""

    def __init__(self, input_size, output_size, hidden_size, dropout=0.0):
        super().__init__()
        self.hidden = nn.Linear(input_size, output_size)
        self.lstm = nn.LSTM(
            input_size=output_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the LSTM outputs: [batch_size, length, hidden_size * 2]."""
        projected = self.dropout(self.hidden(x))
        composed, _final_state = self.lstm(projected)
        return composed
class Pooling_Layer(nn.Module):
    """Collapse the sequence axis with average pooling and max pooling."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``[avg, max]`` concatenated: [batch_size, hidden_size * 4]."""
        seq_len = x.shape[1]
        averaged = x.sum(dim=1) / seq_len        # [batch_size, hidden_size * 2]
        maxima, _positions = x.max(dim=1)        # [batch_size, hidden_size * 2]
        return torch.cat([averaged, maxima], dim=-1)
class InferenceComposition_Layer(nn.Module):
    """Compose and pool both sentences with one shared composition module."""

    def __init__(self, input_size, output_size, hidden_size, dropout=0.0):
        super().__init__()
        self.composition = Composition_Layer(input_size, output_size, hidden_size, dropout)
        self.pooling = Pooling_Layer()

    def forward(self, m_p, m_h):
        """Return the concatenated pooled vectors: [batch_size, hidden_size * 8]."""
        # The premise is composed first, then the hypothesis, with the same weights.
        composed_p = self.composition(m_p)
        composed_h = self.composition(m_h)
        pooled_p = self.pooling(composed_p)
        pooled_h = self.pooling(composed_h)
        return torch.cat([pooled_p, pooled_h], dim=1)
class Output_Layer(nn.Module):
    """Classifier head: dropout -> linear -> tanh -> linear.

    Args:
        input_size: size of the feature vector fed to the head.
        output_size: width of the hidden layer.
        type_num: number of output classes.
        dropout: dropout probability applied to the input features.
    """

    def __init__(self, input_size, output_size, type_num, dropout=0.0):
        super(Output_Layer, self).__init__()
        self.mlp = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_size, output_size),
            # The activation has to be its own module in the Sequential.
            # Passing nn.Tanh as the third positional argument of nn.Linear
            # only sets the `bias` flag and applies no non-linearity.
            nn.Tanh(),
            nn.Linear(output_size, type_num),
        )

    def forward(self, x):
        """Return the class scores for ``x``: [batch_size, type_num]."""
        return self.mlp(x)
class ESIM(nn.Module):
    """ESIM sentence-pair classifier assembled from the layer modules in this file.

    All sizes are read from the module-level ``args`` dictionary. No masking is
    used, so each forward call expects unpadded sequences.
    """

    def __init__(self):
        super(ESIM, self).__init__()
        vocab_size = args['vocab_size']
        embedding_size = args['embedding_size']
        hidden_size = args['hidden_size']
        type_num = args['type_num']
        dropout = args['dropout']
        self.embed = Embedding_Layer(vocab_size, embedding_size)
        self.encoder = Encoding_Layer(embedding_size, hidden_size)
        self.inference = LocalInference_Layer()
        # Local inference emits hidden_size * 8 features per token.
        self.inference_composition = InferenceComposition_Layer(hidden_size * 8, hidden_size, hidden_size, dropout)
        # Pooling both sentences again yields hidden_size * 8 features per pair.
        self.out = Output_Layer(hidden_size * 8, hidden_size, type_num, dropout)

    def forward(self, p, h):
        """Return unnormalised class scores (logits) for premise ``p`` and hypothesis ``h``.

        Args:
            p: token ids of the premise, [batch_size, length1].
            h: token ids of the hypothesis, [batch_size, length2].

        Returns:
            Tensor of shape [batch_size, type_num].
        """
        p_embeded = self.embed(p)
        h_embeded = self.embed(h)
        p_ = self.encoder(p_embeded)
        h_ = self.encoder(h_embeded)
        m_p, m_h = self.inference(p_, h_)
        v = self.inference_composition(m_p, m_h)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself, so
        # a softmax here would be applied twice and flatten the gradients.
        # argmax over the logits gives the same prediction as over softmax.
        return self.out(v)
# Instantiate the network, the classification loss and the optimiser.
model = ESIM()
loss_function = nn.CrossEntropyLoss()
# lr=0.2 is far too large for Adam and makes the loss oscillate; use a small
# step size of the order normally used with Adam instead.
optimizer = optim.Adam(model.parameters(), lr=4e-4)
def match(test_batch):
    """Print the classification accuracy of the global ``model`` on ``test_batch``.

    Args:
        test_batch: sequence of ``(premise_tokens, hypothesis_tokens, label_id)``
            examples. Tokens are looked up in the global ``word_to_ix``.

    The model is put into evaluation mode for the duration of the call, so that
    dropout is disabled, and its previous mode is restored afterwards.
    """
    total = len(test_batch)
    if total == 0:
        # Nothing to evaluate; report zero instead of dividing by zero.
        print('acc = %.6lf%%' % 0.0)
        return

    was_training = model.training
    model.eval()
    acc = 0
    try:
        with torch.no_grad():
            for instance1, instance2, label in test_batch:
                # Each token must first be converted to its vocabulary index.
                premise = [word_to_ix[word] for word in instance1]
                hyposis = [word_to_ix[word] for word in instance2]
                premise = torch.LongTensor(premise).view(1, -1)
                hyposis = torch.LongTensor(hyposis).view(1, -1)
                scores = model(premise, hyposis)
                predicted = torch.argmax(scores, dim=1)
                if predicted[0].item() == label:
                    acc += 1
    finally:
        model.train(was_training)
    # Divide by the number of examples actually evaluated.
    print('acc = %.6lf%%' % (acc / total * 100))
def train(batch_data, batch_size):
    """Run one optimiser step on ``batch_data`` using gradient accumulation.

    Examples are fed through the global ``model`` one at a time, because the
    sequences are unpadded. Each example's loss is divided by ``batch_size``
    before ``backward()``, so the accumulated gradient is the batch average.

    Args:
        batch_data: sequence of ``(premise_tokens, hypothesis_tokens, label_id)``.
        batch_size: divisor applied to every per-example loss.
    """
    model.train()  # make sure dropout is active, even after an evaluation pass
    model.zero_grad()
    batch_loss = 0.0
    for instance1, instance2, label in batch_data:
        # Each token must first be converted to its vocabulary index.
        premise = [word_to_ix[word] for word in instance1]
        hyposis = [word_to_ix[word] for word in instance2]
        premise = torch.LongTensor(premise).view(1, -1)
        hyposis = torch.LongTensor(hyposis).view(1, -1)
        target = torch.LongTensor([label])
        scores = model(premise, hyposis)
        loss = loss_function(scores, target) / batch_size
        loss.backward()
        batch_loss += loss.item()
    # Report the loss of the whole batch once, not one example's scaled share.
    print(' loss = %.6lf' % batch_loss)
    optimizer.step()
# match(test_data)  # accuracy before any training: 33.30%
random.seed(6)
batch_size = args['train_batch_size']
for epoch in range(10):
    print('now in epoch %d...' % epoch)
    # Reshuffle the training examples, then walk through them batch by batch.
    random.shuffle(train_data)
    for start in range(0, train_len, batch_size):
        train(train_data[start:start + batch_size], batch_size)
    # Evaluate on the held-out split after every epoch.
    match(test_data)
2. 总结
1. 模型部分基本上是顺着参考代码打了一遍,主要是因为觉得这种按层的写法太舒服了,比起来我之前写得简直就不能看,甚至才知道LSTM可以直接设置bidirectional... 所以一方面是参考代码实现,另一方面是想学习这种风格,很喜欢,十分感谢大佬提供的参考代码。
2. 第一次跟着论文一步一步慢慢去实现一个模型,受益匪浅。
3. 虽然是跟着实现了,但是模型跑出来的效果奇差,回看了好几遍在逻辑上也找不出问题,总之就先做个记录了。和参考代码的差就在没用mask机制,并且用的是强行扩大batch_size的方法,但是mask机制不就是用来解决不定长输入的问题么,感觉用梯度累积机制应该没有什么太大差距才对... 慢就不多说了,十几个小时跑一个epoch,acc还不变甚是郁闷。
4. 本来batch_size是沿用task2的,本来以为可能还是更新次数太小的锅,试着改成了参考代码的1000,不过更新了两三次之后loss就只在两个数值之间跳,迷惑,试着跑了一个epoch结果肯定也不行。
5. 读取sentence的时候为了省力直接把binary_parse的结构给拆了进行分词了,既然不用树结构也不会有影响吧。
6. 找到原来的问题了,就是这段代码,我也不知道当时怎么想的(ry,遇到N/A应该直接跳过
if pd.isnull(read_data['sentence2_binary_parse'][i]):
# read_data['sentence2_binary_parse'][i] = 'N/A' # 会出现N/A
continue
3. 结果
跑了一天多4个epoch有事就停了,accs = [65.3340, 71.3278, 74.4897, 76.0856],因为只用了50d的embedding估计最多80不到一些吧。