import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
from sklearn_crfsuite import metrics
import pickle
Setup
random.seed(1024)
flatten = lambda l: [item for sublist in l for item in sublist]  # flatten a list of lists into one list
USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor
# Mappings
# Word mapping: <UNK> is the out-of-vocabulary token, <DUMMY> is the padding token for window boundaries
word2index = {'<UNK>': 0, '<DUMMY>': 1}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
index2word = {v: k for k, v in word2index.items()}
tag2index = {}
for tag in tagset:
    if tag2index.get(tag) is None:
        tag2index[tag] = len(tag2index)
index2tag = {v: k for k, v in tag2index.items()}
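The training loop later in this section calls prepare_sequence and prepare_tag, which are not shown in this excerpt. A minimal sketch of compatible implementations, assuming prepare_sequence maps a window of words to a LongTensor of indices (falling back to <UNK> for unseen words) and prepare_tag wraps a single tag index so batches can be concatenated:

def prepare_sequence(seq, to_index):
    # Map each word to its index; unseen words fall back to <UNK> (assumption)
    idxs = [to_index.get(w, to_index['<UNK>']) for w in seq]
    return LongTensor(idxs)

def prepare_tag(tag, to_index):
    # Wrap one tag index in a 1-element tensor so targets can be torch.cat'ed
    return LongTensor([to_index[tag]])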
Generating window-word pairs
The output of this step matches the diagram at the beginning of the article.
WINDOW_SIZE = 2
windows_train = []
for sample in train_data:  # each sample is a full sentence paired with its per-word tags
    dummy = ['<DUMMY>'] * WINDOW_SIZE
    window = list(nltk.ngrams(dummy + list(sample[0]) + dummy, WINDOW_SIZE * 2 + 1))
    windows_train.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])

windows_test = []
for sample in test_data:  # same construction for the test split
    dummy = ['<DUMMY>'] * WINDOW_SIZE
    window = list(nltk.ngrams(dummy + list(sample[0]) + dummy, WINDOW_SIZE * 2 + 1))
    windows_test.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])
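For intuition, here is a small standalone illustration (a toy sentence, not from the actual dataset) of how nltk.ngrams builds the padded windows when WINDOW_SIZE = 2 — every word becomes the center of a 5-gram, with <DUMMY> filling in past the sentence boundaries:

import nltk

dummy = ['<DUMMY>'] * 2
sent = ['John', 'lives', 'here']
windows = list(nltk.ngrams(dummy + sent + dummy, 5))
print(windows[0])  # ('<DUMMY>', '<DUMMY>', 'John', 'lives', 'here')  -> center word 'John'
print(windows[1])  # ('<DUMMY>', 'John', 'lives', 'here', '<DUMMY>')  -> center word 'lives'
print(windows[2])  # ('John', 'lives', 'here', '<DUMMY>', '<DUMMY>')  -> center word 'here'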
model = WindowClassifier(len(word2index), EMBEDDING_SIZE, WINDOW_SIZE, HIDDEN_SIZE, len(tag2index))
if USE_CUDA:
    model = model.cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
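The WindowClassifier class, the getBatch generator, and the hyperparameters (EMBEDDING_SIZE, HIDDEN_SIZE, BATCH_SIZE, EPOCH, LEARNING_RATE) are presumably defined earlier in the article. For completeness, a minimal sketch of compatible definitions — the architecture (concatenate the window's embeddings, then a two-layer MLP with dropout) is an assumption, not necessarily the article's exact model:

class WindowClassifier(nn.Module):
    # Embed every word in the window, concatenate, and classify the center word's tag
    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):
        super(WindowClassifier, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.h_layer1 = nn.Linear(embedding_size * (window_size * 2 + 1), hidden_size)
        self.h_layer2 = nn.Linear(hidden_size, hidden_size)
        self.o_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, inputs, is_training=False):
        embeds = self.embed(inputs)  # (batch, window, embedding)
        concated = embeds.view(-1, embeds.size(1) * embeds.size(2))  # flatten each window
        h0 = self.relu(self.h_layer1(concated))
        if is_training:
            h0 = self.dropout(h0)
        h1 = self.relu(self.h_layer2(h0))
        if is_training:
            h1 = self.dropout(h1)
        return self.o_layer(h1)  # raw logits; CrossEntropyLoss applies log-softmax itself

def getBatch(batch_size, data):
    # Shuffle once per epoch, then yield successive mini-batches
    random.shuffle(data)
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]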
for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, windows_train)):
        x, y = list(zip(*batch))
        # view reshapes each window to 2-D so the batch can be concatenated into one matrix
        inputs = torch.cat([prepare_sequence(sent, word2index).view(1, -1) for sent in x])
        targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])
        model.zero_grad()
        preds = model(inputs, is_training=True)
        loss = loss_function(preds, targets)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if i % 1000 == 0:
            print("[%d/%d] mean_loss : %0.2f" % (epoch, EPOCH, np.mean(losses)))
            losses = []

# Save the trained model
torch.save(model.state_dict(), "./simplennmodel.pkl")
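To reuse the saved weights later, a minimal loading-and-prediction sketch; the per-window accuracy computation over windows_test is an assumption for illustration, not part of the original:

# Reload the trained weights into a fresh model instance (assumed usage)
model = WindowClassifier(len(word2index), EMBEDDING_SIZE, WINDOW_SIZE, HIDDEN_SIZE, len(tag2index))
model.load_state_dict(torch.load("./simplennmodel.pkl"))
if USE_CUDA:
    model = model.cuda()
model.eval()

correct = 0
for window, tag in windows_test:
    input_ = prepare_sequence(window, word2index).view(1, -1)
    pred = model(input_).max(1)[1]  # index of the highest-scoring tag
    if index2tag[pred.item()] == tag:
        correct += 1
print("accuracy: %.2f%%" % (100.0 * correct / len(windows_test)))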