
- Pre-training approach

- Text -> vector
  word2vec can also map text to vectors, but word2vec embeddings are static: a token always gets the same vector.
  BERT's representations are dynamic: the vector for a token depends on the context it appears in (see the sketch below).
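A quick way to see the difference is to encode the same character in two different sentences and compare the output vectors; a word2vec table would return identical vectors, while BERT does not. A minimal sketch, assuming the local bert-base-chinese checkpoint used below also contains the tokenizer files (vocab.txt):

from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained(r"..\..\bert-base-chinese")
bert = BertModel.from_pretrained(r"..\..\bert-base-chinese", return_dict=False)
bert.eval()

# the same character "苹" in two different contexts
s1 = tokenizer("我爱吃苹果", return_tensors="pt")
s2 = tokenizer("苹果发布了新手机", return_tensors="pt")
with torch.no_grad():
    out1, _ = bert(**s1)   # (1, seq_len, 768)
    out2, _ = bert(**s2)
v1 = out1[0, 4]   # "苹" sits at position 4 after [CLS] in the first sentence
v2 = out2[0, 1]   # "苹" sits at position 1 after [CLS] in the second sentence
print(torch.cosine_similarity(v1, v2, dim=0))  # < 1: same token, different vectors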
BERT structure - Embedding
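The embedding layer is the sum of three look-up tables followed by layer normalization, which is exactly what embedding_forward does in the DiyBert code further down:

$$\mathrm{Embedding}(x) = \mathrm{LayerNorm}\big(E_{\mathrm{token}}[x] + E_{\mathrm{pos}}[0{:}L] + E_{\mathrm{seg}}[s]\big)$$

where L is the sequence length and s is the segment (token type) id; for a single sentence s = 0.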

BERT structure - Transformer - self_attention
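The per-head computation is the standard scaled dot-product attention, the same computation that self_attention in DiyBert performs with numpy:

$$Q = xW_Q^{T} + b_Q,\quad K = xW_K^{T} + b_K,\quad V = xW_V^{T} + b_V$$

$$\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\!\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V$$

where d_k is the per-head dimension (768 / 12 = 64 for bert-base).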

BERT structure - Transformer - multi-head mechanism (multi_head)
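Multi-head attention splits the 768-dimensional hidden vector into 12 heads of 64 dimensions each, runs attention independently in every subspace, and concatenates the results before a final linear projection:

$$\mathrm{MultiHead}(x) = \mathrm{Concat}(\mathrm{head}_1,\dots,\mathrm{head}_{12})\,W_O^{T} + b_O,\qquad \mathrm{head}_i = \mathrm{Attention}(Q_i, K_i, V_i)$$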

BERT structure - Transformer
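Each encoder layer applies the two sublayers with residual connections and post-layer-norm, the structure that single_transformer_layer_forward reproduces below; bert-base stacks 12 such layers:

$$
\begin{aligned}
h &= \mathrm{LayerNorm}\big(x + \mathrm{MultiHead}(x)\big)\\
\mathrm{out} &= \mathrm{LayerNorm}\big(h + \mathrm{FFN}(h)\big),\qquad \mathrm{FFN}(h) = \mathrm{gelu}(hW_1^{T} + b_1)\,W_2^{T} + b_2
\end{aligned}
$$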

- BERT advantages
1. Pre-training exploits massive amounts of unlabeled text
2. Compared with static word vectors, BERT's text representations incorporate context
3. The Transformer architecture has strong fitting capacity, and the distance between two words does not degrade the modeling of their relationship
4. Large improvements in downstream task results

- BERT disadvantages
1. Pre-training requires data, time, and machines (open-source pre-trained models alleviate this)
2. Hard to apply to generative tasks
3. Large parameter count and heavy computation, which cannot meet the performance requirements of some real-world scenarios
4. Without downstream data for fine-tuning, results are still unsatisfactory
DiyBert
import torch
import math
import numpy as np
from transformers import BertModel

'''
Implement the BERT forward pass with hand-written matrix operations
Model files can be downloaded from https://huggingface.co/models
'''
bert = BertModel.from_pretrained(r"..\..\bert-base-chinese", return_dict=False)
state_dict = bert.state_dict()
bert.eval()
x = np.array([2450, 15486, 102, 2110])  # token ids of a 4-token example input
torch_x = torch.LongTensor([x])
sequence_output, pooler_output = bert(torch_x)
print(sequence_output.shape, pooler_output.shape)
print(bert.state_dict().keys())
def softmax(x):
    # normalize over the last axis
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)


def gelu(x):
    # tanh approximation of gelu
    return 0.5 * x * (1 + np.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * np.power(x, 3))))


class DiyBert:
    # load weights from the torch state_dict and run the forward pass with numpy only
    def __init__(self, state_dict):
        self.num_attention_heads = 12
        self.hidden_size = 768
        self.num_layers = 12
        self.load_weights(state_dict)

    def load_weights(self, state_dict):
        # embedding layer weights
        self.word_embeddings = state_dict["embeddings.word_embeddings.weight"].numpy()
        self.position_embeddings = state_dict["embeddings.position_embeddings.weight"].numpy()
        self.token_type_embeddings = state_dict["embeddings.token_type_embeddings.weight"].numpy()
        self.embeddings_layer_norm_weight = state_dict["embeddings.LayerNorm.weight"].numpy()
        self.embeddings_layer_norm_bias = state_dict["embeddings.LayerNorm.bias"].numpy()
        self.transformer_weights = []
        # per-layer transformer weights
        for i in range(self.num_layers):
            q_w = state_dict["encoder.layer.%d.attention.self.query.weight" % i].numpy()
            q_b = state_dict["encoder.layer.%d.attention.self.query.bias" % i].numpy()
            k_w = state_dict["encoder.layer.%d.attention.self.key.weight" % i].numpy()
            k_b = state_dict["encoder.layer.%d.attention.self.key.bias" % i].numpy()
            v_w = state_dict["encoder.layer.%d.attention.self.value.weight" % i].numpy()
            v_b = state_dict["encoder.layer.%d.attention.self.value.bias" % i].numpy()
            attention_output_weight = state_dict["encoder.layer.%d.attention.output.dense.weight" % i].numpy()
            attention_output_bias = state_dict["encoder.layer.%d.attention.output.dense.bias" % i].numpy()
            attention_layer_norm_w = state_dict["encoder.layer.%d.attention.output.LayerNorm.weight" % i].numpy()
            attention_layer_norm_b = state_dict["encoder.layer.%d.attention.output.LayerNorm.bias" % i].numpy()
            intermediate_weight = state_dict["encoder.layer.%d.intermediate.dense.weight" % i].numpy()
            intermediate_bias = state_dict["encoder.layer.%d.intermediate.dense.bias" % i].numpy()
            output_weight = state_dict["encoder.layer.%d.output.dense.weight" % i].numpy()
            output_bias = state_dict["encoder.layer.%d.output.dense.bias" % i].numpy()
            ff_layer_norm_w = state_dict["encoder.layer.%d.output.LayerNorm.weight" % i].numpy()
            ff_layer_norm_b = state_dict["encoder.layer.%d.output.LayerNorm.bias" % i].numpy()
            self.transformer_weights.append(
                [q_w, q_b, k_w, k_b, v_w, v_b, attention_output_weight, attention_output_bias,
                 attention_layer_norm_w, attention_layer_norm_b, intermediate_weight, intermediate_bias,
                 output_weight, output_bias, ff_layer_norm_w, ff_layer_norm_b])
        # pooler layer weights
        self.pooler_dense_weight = state_dict["pooler.dense.weight"].numpy()
        self.pooler_dense_bias = state_dict["pooler.dense.bias"].numpy()

    def embedding_forward(self, x):
        # x: (max_len,) -> embedding: (max_len, hidden_size)
        we = self.get_embedding(self.word_embeddings, x)                                   # token embedding
        pe = self.get_embedding(self.position_embeddings, np.array(list(range(len(x)))))  # position embedding
        te = self.get_embedding(self.token_type_embeddings, np.array([0] * len(x)))       # segment embedding (single sentence -> all zeros)
        embedding = we + pe + te
        embedding = self.layer_norm(embedding, self.embeddings_layer_norm_weight, self.embeddings_layer_norm_bias)
        return embedding

    def get_embedding(self, embedding_matrix, x):
        # look up rows of the embedding matrix by index
        return np.array([embedding_matrix[index] for index in x])

    def all_transformer_layer_forward(self, x):
        for i in range(self.num_layers):
            x = self.single_transformer_layer_forward(x, i)
        return x

    def single_transformer_layer_forward(self, x, layer_index):
        weights = self.transformer_weights[layer_index]
        q_w, q_b, \
        k_w, k_b, \
        v_w, v_b, \
        attention_output_weight, attention_output_bias, \
        attention_layer_norm_w, attention_layer_norm_b, \
        intermediate_weight, intermediate_bias, \
        output_weight, output_bias, \
        ff_layer_norm_w, ff_layer_norm_b = weights
        # self-attention sublayer
        attention_output = self.self_attention(x,
                                               q_w, q_b,
                                               k_w, k_b,
                                               v_w, v_b,
                                               attention_output_weight, attention_output_bias,
                                               self.num_attention_heads,
                                               self.hidden_size)
        # residual connection + layer norm
        x = self.layer_norm(x + attention_output, attention_layer_norm_w, attention_layer_norm_b)
        # feed-forward sublayer
        feed_forward_x = self.feed_forward(x,
                                           intermediate_weight, intermediate_bias,
                                           output_weight, output_bias)
        # residual connection + layer norm
        x = self.layer_norm(x + feed_forward_x, ff_layer_norm_w, ff_layer_norm_b)
        return x

    def self_attention(self,
                       x,
                       q_w, q_b,
                       k_w, k_b,
                       v_w, v_b,
                       attention_output_weight,
                       attention_output_bias,
                       num_attention_heads,
                       hidden_size):
        # x: (max_len, hidden_size)
        q = np.dot(x, q_w.T) + q_b
        k = np.dot(x, k_w.T) + k_b
        v = np.dot(x, v_w.T) + v_b
        attention_head_size = int(hidden_size / num_attention_heads)
        # reshape to (num_heads, max_len, head_size)
        q = self.transpose_for_scores(q, attention_head_size, num_attention_heads)
        k = self.transpose_for_scores(k, attention_head_size, num_attention_heads)
        v = self.transpose_for_scores(v, attention_head_size, num_attention_heads)
        # scaled dot-product attention per head
        qk = np.matmul(q, k.swapaxes(1, 2))
        qk /= np.sqrt(attention_head_size)
        qk = softmax(qk)
        qkv = np.matmul(qk, v)
        # concatenate the heads back to (max_len, hidden_size)
        qkv = qkv.swapaxes(0, 1).reshape(-1, hidden_size)
        # output projection
        attention = np.dot(qkv, attention_output_weight.T) + attention_output_bias
        return attention

    def transpose_for_scores(self, x, attention_head_size, num_attention_heads):
        # (max_len, hidden_size) -> (num_heads, max_len, head_size)
        max_len, hidden_size = x.shape
        x = x.reshape(max_len, num_attention_heads, attention_head_size)
        x = x.swapaxes(1, 0)
        return x

    def feed_forward(self,
                     x,
                     intermediate_weight, intermediate_bias,
                     output_weight, output_bias):
        # hidden_size -> intermediate_size -> hidden_size, with gelu in between
        x = np.dot(x, intermediate_weight.T) + intermediate_bias
        x = gelu(x)
        x = np.dot(x, output_weight.T) + output_bias
        return x

    def layer_norm(self, x, w, b):
        # normalize over the hidden dimension, then rescale and shift
        x = (x - np.mean(x, axis=1, keepdims=True)) / np.std(x, axis=1, keepdims=True)
        x = x * w + b
        return x

    def pooler_output_layer(self, x):
        # linear + tanh on the [CLS] vector
        x = np.dot(x, self.pooler_dense_weight.T) + self.pooler_dense_bias
        x = np.tanh(x)
        return x

    def forward(self, x):
        # x: array of token ids -> (sequence_output, pooler_output)
        x = self.embedding_forward(x)
        sequence_output = self.all_transformer_layer_forward(x)
        pooler_output = self.pooler_output_layer(sequence_output[0])
        return sequence_output, pooler_output

db = DiyBert(state_dict)
diy_sequence_output, diy_pooler_output = db.forward(x)
torch_sequence_output, torch_pooler_output = bert(torch_x)
print(diy_sequence_output)
print(torch_sequence_output)
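To check that the hand-written forward pass reproduces the library output, a quick numerical comparison can be appended (a sketch, not part of the original script; small differences are expected because the gelu here is the tanh approximation and layer_norm omits the epsilon term):

# compare the two sequence outputs element-wise
torch_out = torch_sequence_output[0].detach().numpy()   # drop the batch dimension
print("max abs diff:", np.abs(diy_sequence_output - torch_out).max())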
BertDemo
import torch
import torch.nn as nn
import numpy as np
import random
import json
from transformers import BertModel

"""
Network built with pytorch.
Implements a model for a simple NLP task:
decide whether certain specific characters appear in a text.
This is the week2 example, modified to use BERT as the encoder.
"""
class TorchModel(nn.Module):
    def __init__(self, input_dim, sentence_length, vocab):
        super(TorchModel, self).__init__()
        # use a pre-trained BERT as the encoder
        self.bert = BertModel.from_pretrained(r"..\..\bert-base-chinese", return_dict=False)
        self.classify = nn.Linear(input_dim, 3)   # 3-way classification head
        self.activation = torch.sigmoid
        self.dropout = nn.Dropout(0.5)
        self.loss = nn.functional.cross_entropy

    # return the loss when the true label y is given; otherwise return the predictions
    def forward(self, x, y=None):
        sequence_output, pooler_output = self.bert(x)
        x = self.classify(pooler_output)
        y_pred = self.activation(x)
        if y is not None:
            return self.loss(y_pred, y)
        else:
            return y_pred
def build_vocab():
    chars = "abcdefghijklmnopqrstuvwxyz"
    vocab = {}
    for index, char in enumerate(chars):
        vocab[char] = index + 1
    vocab['unk'] = len(vocab) + 1
    return vocab
# randomly generate one sample
# class 0: contains a/b/c but none of x/y/z
# class 1: contains x/y/z but none of a/b/c
# class 2: everything else
def build_sample(vocab, sentence_length):
    x = [random.choice(list(vocab.keys())) for _ in range(sentence_length)]
    if set("abc") & set(x) and not set("xyz") & set(x):
        y = 0
    elif not set("abc") & set(x) and set("xyz") & set(x):
        y = 1
    else:
        y = 2
    x = [vocab.get(word, vocab['unk']) for word in x]
    return x, y
def build_dataset(batch_size, vocab, sentence_length):
    dataset_x = []
    dataset_y = []
    for i in range(batch_size):
        x, y = build_sample(vocab, sentence_length)
        dataset_x.append(x)
        dataset_y.append(y)
    return torch.LongTensor(dataset_x), torch.LongTensor(dataset_y)
def build_model(vocab, char_dim, sentence_length):
    model = TorchModel(char_dim, sentence_length, vocab)
    return model
# evaluate accuracy on freshly generated samples
def evaluate(model, vocab, sample_length):
    model.eval()
    total = 200
    x, y = build_dataset(total, vocab, sample_length)
    y = y.squeeze()
    print("Class A samples: %d, class B samples: %d, class C samples: %d"
          % (y.tolist().count(0), y.tolist().count(1), y.tolist().count(2)))
    correct, wrong = 0, 0
    with torch.no_grad():
        y_pred = model(x)
        for y_p, y_t in zip(y_pred, y):
            if int(torch.argmax(y_p)) == int(y_t):
                correct += 1
            else:
                wrong += 1
    print("Correct predictions: %d / %d, accuracy: %f" % (correct, total, correct / (correct + wrong)))
    return correct / (correct + wrong)


def main():
    epoch_num = 15            # number of training epochs
    batch_size = 20           # batch size
    train_sample = 1000       # samples per epoch
    char_dim = 768            # embedding dimension, must match BERT's hidden size
    sentence_length = 6       # length of each sample text
    vocab = build_vocab()
    model = build_model(vocab, char_dim, sentence_length)
    optim = torch.optim.Adam(model.parameters(), lr=1e-5)
    log = []
    for epoch in range(epoch_num):
        model.train()
        watch_loss = []
        for batch in range(int(train_sample / batch_size)):
            x, y = build_dataset(batch_size, vocab, sentence_length)
            optim.zero_grad()
            loss = model(x, y)
            loss.backward()
            optim.step()
            watch_loss.append(loss.item())
        print("=========\nEpoch %d average loss: %f" % (epoch + 1, np.mean(watch_loss)))
        acc = evaluate(model, vocab, sentence_length)
        log.append([acc, np.mean(watch_loss)])
    torch.save(model.state_dict(), "model.pth")
    writer = open("vocab.json", "w", encoding="utf8")
    writer.write(json.dumps(vocab, ensure_ascii=False, indent=2))
    writer.close()
    return
# run predictions with a trained model
def predict(model_path, vocab_path, input_strings):
    char_dim = 768            # must match the dimension used in training (BERT hidden size)
    sentence_length = 6
    vocab = json.load(open(vocab_path, "r", encoding="utf8"))
    model = build_model(vocab, char_dim, sentence_length)
    model.load_state_dict(torch.load(model_path))
    x = []
    for input_string in input_strings:
        x.append([vocab[char] for char in input_string])
    model.eval()
    with torch.no_grad():
        result = model.forward(torch.LongTensor(x))
    for i, input_string in enumerate(input_strings):
        print(int(torch.argmax(result[i])), input_string, result[i])


if __name__ == "__main__":
    main()
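main() only trains and writes model.pth and vocab.json; to try the trained classifier afterwards, predict can be called with a few length-6 strings. A hypothetical usage example (the strings below are illustrative, not from the original script):

# run after main() has produced model.pth and vocab.json
test_strings = ["abcdef", "uvwxyz", "ghijkl", "adxyzk"]
predict("model.pth", "vocab.json", test_strings)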
Counting BERT parameters
from transformers import BertModel
'''
Count the number of parameters of BERT
'''
model = BertModel.from_pretrained(r"..\bert-base-chinese", return_dict=False)
n = 2                        # number of segment (token type) ids
vocab = 21128                # vocabulary size of bert-base-chinese
max_sequence_length = 512    # maximum sequence length (position embeddings)
embedding_size = 768         # hidden size
hide_size = 3072             # intermediate (feed-forward) size
num_layers = 12              # bert-base has 12 transformer layers
# embedding layer: token + position + segment embeddings, plus LayerNorm weight and bias
embedding_parameters = vocab * embedding_size + max_sequence_length * embedding_size + n * embedding_size + embedding_size + embedding_size
# Q/K/V projections: weight and bias for each
self_attention_parameters = (embedding_size * embedding_size + embedding_size) * 3
# attention output: dense weight and bias, plus LayerNorm weight and bias
self_attention_out_parameters = embedding_size * embedding_size + embedding_size + embedding_size + embedding_size
# feed-forward: two dense layers, plus LayerNorm weight and bias
feed_forward_parameters = embedding_size * hide_size + hide_size + embedding_size * hide_size + embedding_size + embedding_size + embedding_size
# pooler: one dense layer
pool_fc_parameters = embedding_size * embedding_size + embedding_size
all_parameters = embedding_parameters + (self_attention_parameters + self_attention_out_parameters + feed_forward_parameters) * num_layers + pool_fc_parameters
print("Actual number of model parameters: %d" % sum(p.numel() for p in model.parameters()))
print("Number of parameters computed by hand: %d" % all_parameters)