
Chinese Named Entity Recognition with BiLSTM-CRF

Project repository: NLP-Application-and-Practice/11_BiLSTM-ner-bilstm-crf/11.3-BiLSTM-CRF的中文命名实体识别/ner_bilstm_crf at master · zz-zik/NLP-Application-and-Practice (github.com)

Reading the renmindata.pkl file

read_file_pkl.py

# encoding:utf-8
import pickle


# Read the pickled dataset
def load_data():
    pickle_path = './data_target_pkl/renmindata.pkl'
    with open(pickle_path, 'rb') as inp:
        word2id = pickle.load(inp)
        id2word = pickle.load(inp)
        tag2id = pickle.load(inp)
        id2tag = pickle.load(inp)
        x_train = pickle.load(inp)
        y_train = pickle.load(inp)
        x_test = pickle.load(inp)
        y_test = pickle.load(inp)
        x_valid = pickle.load(inp)
        y_valid = pickle.load(inp)
    print("train len:", len(x_train))
    print("test len:", len(x_test))
    print("valid len:", len(x_valid))
    return word2id, tag2id, x_train, x_test, x_valid, y_train, y_test, y_valid, id2tag


def main():
    word = load_data()
    print(len(word))


if __name__ == '__main__':
    main()

This code defines a load_data() function that reads the data stored in './data_target_pkl/renmindata.pkl'. It opens the file with the pickle module, loads each object in turn into the corresponding variable, prints the sizes of the training, test and validation sets, and returns the loaded objects (the vocabulary and tag mappings plus the three data splits). main() calls load_data() and prints the length of the returned tuple, serving as a quick check that the pickle file loads correctly.
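For a quick sanity check, the loader can also be called directly. The snippet below is only a minimal illustration; the field names follow the function above, and it assumes each row of x_train/y_train is an indexable sequence of IDs.

# Minimal sanity check of load_data(); field names follow the function above.
from read_file_pkl import load_data

word2id, tag2id, x_train, x_test, x_valid, y_train, y_test, y_valid, id2tag = load_data()
print("vocab size:", len(word2id), "tag set size:", len(tag2id))
print("first sentence (word IDs):", x_train[0][:20])
print("first sentence (tag IDs): ", y_train[0][:20])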

Building the BiLSTM-CRF

bilstm_crf_model.py

# encoding:utf-8
import torch
import torch.nn as nn
from TorchCRF import CRF
from torch.utils.data import Dataset


# Dataset for named entity recognition
class NERDataset(Dataset):
    def __init__(self, X, Y, *args, **kwargs):
        self.data = [{'x': X[i], 'y': Y[i]} for i in range(X.shape[0])]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)


# BiLSTM-CRF model
class NERLSTM_CRF(nn.Module):
    def __init__(self, config):
        super(NERLSTM_CRF, self).__init__()
        self.embedding_dim = config.embedding_dim
        self.hidden_dim = config.hidden_dim
        self.vocab_size = config.vocab_size
        self.num_tags = config.num_tags
        self.embeds = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.dropout = nn.Dropout(config.dropout)
        self.lstm = nn.LSTM(
            self.embedding_dim,
            self.hidden_dim // 2,
            num_layers=1,
            bidirectional=True,
            batch_first=True,  # with batch_first=True, inputs are shaped (batch, seq_len, features)
        )
        self.linear = nn.Linear(self.hidden_dim, self.num_tags)
        # CRF layer
        self.crf = CRF(self.num_tags)

    def forward(self, x, mask):
        embeddings = self.embeds(x)
        feats, hidden = self.lstm(embeddings)
        emissions = self.linear(self.dropout(feats))
        outputs = self.crf.viterbi_decode(emissions, mask)
        return outputs

    def log_likelihood(self, x, labels, mask):
        embeddings = self.embeds(x)
        feats, hidden = self.lstm(embeddings)
        emissions = self.linear(self.dropout(feats))
        loss = -self.crf.forward(emissions, labels, mask)
        return torch.sum(loss)

This code defines a BiLSTM-CRF model for named entity recognition. NERDataset is a custom dataset class, subclassing torch.utils.data.Dataset, that stores the NER samples as {'x', 'y'} pairs. NERLSTM_CRF subclasses torch.nn.Module and consists of an embedding layer, a bidirectional LSTM, a linear layer that produces per-token emission scores, and a CRF layer. forward() runs Viterbi decoding over the emissions to return the best tag sequence, while log_likelihood() returns the summed negative log-likelihood of the gold tag sequence, which serves as the training loss.
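To see the interface in action, the sketch below instantiates the model with a small ad-hoc config and runs it on random data. It is an illustration only: the config values are placeholders, not the project's real hyperparameters.

# Smoke test of NERLSTM_CRF on random data; the config values are illustrative.
import torch
from types import SimpleNamespace
from bilstm_crf_model import NERLSTM_CRF

config = SimpleNamespace(embedding_dim=100, hidden_dim=200,
                         vocab_size=3000, num_tags=7, dropout=0.2)
model = NERLSTM_CRF(config)

x = torch.randint(1, config.vocab_size, (2, 10))  # 2 sentences, 10 tokens each
y = torch.randint(0, config.num_tags, (2, 10))    # random gold tags
mask = (x > 0)                                    # every position is a real token here

loss = model.log_likelihood(x, y, mask)  # scalar training loss (negative log-likelihood)
paths = model(x, mask)                   # Viterbi-decoded tag-ID sequences
print(loss.item())
print(paths[0])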

Model information

utils.py

utils.py supplies the pieces imported by the training script: parse_tags decodes the model's predicted tag IDs into readable entity labels, the Config class defines the hyperparameters, and utils_to_train builds and returns the objects needed during training (max_epoch, device, the train/validation/test data loaders, the optimizer and the model).
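The utils.py source is not reproduced here, but its shape can be inferred from how it is used in train.py. The sketch below is a reconstruction under stated assumptions: every hyperparameter value, the Adam optimizer and the character-by-character parse_tags are guesses; only the batch size of 64 is suggested by the "* 64" averaging in train.py.

# Hypothetical sketch of utils.py, reconstructed from its call sites in train.py.
# All hyperparameter values and the optimizer choice are assumptions.
import torch
from torch.utils.data import DataLoader
from read_file_pkl import load_data
from bilstm_crf_model import NERDataset, NERLSTM_CRF


class Config:
    embedding_dim = 100   # assumed
    hidden_dim = 200      # assumed
    dropout = 0.2         # assumed
    lr = 0.001            # assumed
    batch_size = 64       # matches the "* 64" averaging in train.py
    max_epoch = 10        # assumed


def parse_tags(text, path):
    # Assumed behaviour: pair each character with the name of its predicted tag.
    id2tag = load_data()[-1]
    return [(char, id2tag[tag_id]) for char, tag_id in zip(text, path)]


def utils_to_train():
    word2id, tag2id, x_train, x_test, x_valid, y_train, y_test, y_valid, id2tag = load_data()
    config = Config()
    config.vocab_size = len(word2id) + 1
    config.num_tags = len(tag2id)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = NERLSTM_CRF(config).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)  # assumed optimizer

    train_data_loader = DataLoader(NERDataset(x_train, y_train),
                                   batch_size=config.batch_size, shuffle=True)
    valid_data_loader = DataLoader(NERDataset(x_valid, y_valid),
                                   batch_size=config.batch_size, shuffle=False)
    test_data_loader = DataLoader(NERDataset(x_test, y_test),
                                  batch_size=config.batch_size, shuffle=False)

    return (config.max_epoch, device, train_data_loader, valid_data_loader,
            test_data_loader, optimizer, model)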

Training the BiLSTM-CRF

train.py

# encoding:utf-8
import torch
from utils import load_data
from utils import parse_tags
from utils import utils_to_train
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

word2id = load_data()[0]
max_epoch, device, train_data_loader, valid_data_loader, test_data_loader, optimizer, model = utils_to_train()


# Chinese named entity recognition
class ChineseNER(object):
    def train(self):
        for epoch in range(max_epoch):
            # training mode
            model.train()
            for index, batch in enumerate(train_data_loader):
                # zero the gradients
                optimizer.zero_grad()
                # move the training batch to the device (GPU)
                x = batch['x'].to(device)
                mask = (x > 0).to(device)
                y = batch['y'].to(device)
                # forward pass: compute the loss
                loss = model.log_likelihood(x, y, mask)
                # backward pass
                loss.backward()
                # gradient clipping
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=10)
                # update parameters
                optimizer.step()
                if index % 200 == 0:
                    print('epoch:%5d,------------loss:%f' % (epoch, loss.item()))

            # validation loss and metrics
            aver_loss = 0
            preds, labels = [], []
            for index, batch in enumerate(valid_data_loader):
                # evaluation mode
                model.eval()
                # move the validation batch to the device
                val_x, val_y = batch['x'].to(device), batch['y'].to(device)
                val_mask = (val_x > 0).to(device)
                predict = model(val_x, val_mask)
                # forward pass to compute the loss
                loss = model.log_likelihood(val_x, val_y, val_mask)
                aver_loss += loss.item()
                # collect the non-zero entries, i.e. the lengths of the real (unpadded) label sequences
                leng = []
                res = val_y.cpu()
                for i in val_y.cpu():
                    tmp = []
                    for j in i:
                        if j.item() > 0:
                            tmp.append(j.item())
                    leng.append(tmp)
                for index, i in enumerate(predict):
                    preds += i[:len(leng[index])]
                for index, i in enumerate(val_y.tolist()):
                    labels += i[:len(leng[index])]
            # loss value and evaluation metrics
            aver_loss /= (len(valid_data_loader) * 64)
            precision = precision_score(labels, preds, average='macro')
            recall = recall_score(labels, preds, average='macro')
            f1 = f1_score(labels, preds, average='macro')
            report = classification_report(labels, preds)
            print(report)
            torch.save(model.state_dict(), 'params1.data_target_pkl')

    # prediction: input a single sentence, output the characters with their predicted tags
    def predict(self, input_str=""):
        model.load_state_dict(torch.load("../models/ner/params1.data_target_pkl"))
        model.eval()
        if not input_str:
            input_str = input("请输入文本: ")
        input_vec = []
        for char in input_str:
            if char not in word2id:
                input_vec.append(word2id['[unknown]'])
            else:
                input_vec.append(word2id[char])
        # convert to tensor
        sentences = torch.tensor(input_vec).view(1, -1).to(device)
        mask = sentences > 0
        paths = model(sentences, mask)
        res = parse_tags(input_str, paths[0])
        return res

    # evaluate performance on the test set
    def test(self, test_dataloader):
        model.load_state_dict(torch.load("../models/ner/params1.data_target_pkl"))
        aver_loss = 0
        preds, labels = [], []
        for index, batch in enumerate(test_dataloader):
            # evaluation mode
            model.eval()
            # move the test batch to the device
            val_x, val_y = batch['x'].to(device), batch['y'].to(device)
            val_mask = (val_x > 0).to(device)
            predict = model(val_x, val_mask)
            # forward pass to compute the loss
            loss = model.log_likelihood(val_x, val_y, val_mask)
            aver_loss += loss.item()
            # collect the non-zero entries, i.e. the lengths of the real (unpadded) label sequences
            leng = []
            for i in val_y.cpu():
                tmp = []
                for j in i:
                    if j.item() > 0:
                        tmp.append(j.item())
                leng.append(tmp)
            for index, i in enumerate(predict):
                preds += i[:len(leng[index])]
            for index, i in enumerate(val_y.tolist()):
                labels += i[:len(leng[index])]
        # loss value and evaluation metrics
        aver_loss /= len(test_dataloader)
        precision = precision_score(labels, preds, average='macro')
        recall = recall_score(labels, preds, average='macro')
        f1 = f1_score(labels, preds, average='macro')
        report = classification_report(labels, preds)
        print(report)


if __name__ == '__main__':
    cn = ChineseNER()
    cn.train()

This script implements training, evaluation and prediction for Chinese named entity recognition. It loads the data and training objects from utils.py, trains the BiLSTM-CRF batch by batch with gradient clipping, and after each epoch computes the validation loss together with macro precision, recall, F1 and a full classification report. The test method runs the same evaluation on the test set, and predict converts an input sentence to word IDs, decodes it with the model and returns the result via parse_tags. Note that train() saves the weights to 'params1.data_target_pkl' in the working directory, while predict() and test() load them from '../models/ner/params1.data_target_pkl', so the checkpoint must be moved (or the paths unified) before running inference.
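Once a checkpoint exists at the path predict() expects, single-sentence inference looks roughly like the snippet below. The example sentence is arbitrary and the exact output format depends on parse_tags.

# Hypothetical usage of the trained model for single-sentence prediction.
from train import ChineseNER

cn = ChineseNER()
# cn.train()  # run once first to produce a checkpoint
print(cn.predict("周恩来生于江苏淮安"))  # characters paired with their predicted tags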
