当前位置: 首页 > news >正文

大模型入门3:理解LLAMA

  • LLama在transformers库中的代码,以及各部分原理
  • Llama3.1技术报告
  • LLama 33b 微调尝试

Model

  • a stack of DecoderBlocks(SelfAttention, FeedForward, and RMSNorm)
    在这里插入图片描述
    decoder block 整体结构:最大的区别在pre-norm

x -> norm(x) -> attention() -> residual connect -> norm() -> ffn -> residual connect

class DecoderBlock(nn.Module):def __init__(self, config):super().__init__()self.n_heads = config['n_heads']self.dim = config['embed_dim']self.head_dim = self.dim // self.n_headsself.attention = SelfAttention(config)self.feed_forward = FeedForward(config)# rms before attention blockself.attention_norm = RMSNorm(self.dim, eps=config['norm_eps'])# rms before  feed forward blockself.ffn_norm = RMSNorm(self.dim, eps=config['norm_eps'])def forward(self, x, start_pos, freqs_complex):# (m, seq_len, dim)h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_complex)# (m, seq_len, dim)out = h + self.feed_forward.forward(self.ffn_norm(h))return outclass Transformer(nn.Module):def __init__(self, config):super().__init__()self.vocab_size = config['vocab_size']self.n_layers = config['n_layers']self.tok_embeddings = nn.Embedding(self.vocab_size, config['embed_dim'])self.head_dim = config['embed_dim'] // config['n_heads']self.layers = nn.ModuleList()for layer_id in range(config['n_layers']):self.layers.append(DecoderBlock(config))self.norm = RMSNorm(config['embed_dim'], eps=config['norm_eps'])self.output = nn.Linear(config['embed_dim'], self.vocab_size, bias=False)self.freqs_complex = precompute_theta_pos_frequencies(self.head_dim, config['max_seq_len'] * 2, device=(config['device']))def forward(self, tokens, start_pos):# (m, seq_len)batch_size, seq_len = tokens.shape# (m, seq_len) -> (m, seq_len, embed_dim)h = self.tok_embeddings(tokens)# (seq_len, (embed_dim/n_heads)/2]freqs_complex = self.freqs_complex[start_pos:start_pos + seq_len]# Consecutively apply all the encoder layers# (m, seq_len, dim)for layer in self.layers:h = layer(h, start_pos, freqs_complex)h = self.norm(h)# (m, seq_len, vocab_size)output = self.output(h).float()return outputmodel = Transformer(config).to(config['device'])
res = model.forward(test_set['input_ids'].to(config['device']), 0)
print(res.size())

RoPE

在这里插入图片描述

def precompute_theta_pos_frequencies(head_dim, seq_len, device, theta=10000.0):# theta_i = 10000^(-2(i-1)/dim) for i = [1, 2, ... dim/2]# (head_dim / 2)theta_numerator = torch.arange(0, head_dim, 2).float()theta = 1.0 / (theta ** (theta_numerator / head_dim)).to(device)# (seq_len)m = torch.arange(seq_len, device=device)# (seq_len, head_dim / 2)freqs = torch.outer(m, theta).float()# complex numbers in polar, c = R * exp(m * theta), where R = 1:# (seq_len, head_dim/2)freqs_complex = torch.polar(torch.ones_like(freqs), freqs)return freqs_complexdef apply_rotary_embeddings(x, freqs_complex, device):# last dimension pairs of two values represent real and imaginary# two consecutive values will become a single complex number# (m, seq_len, num_heads, head_dim/2, 2)x = x.float().reshape(*x.shape[:-1], -1, 2)# (m, seq_len, num_heads, head_dim/2)x_complex = torch.view_as_complex(x)# (seq_len, head_dim/2) --> (1, seq_len, 1, head_dim/2)freqs_complex = freqs_complex.unsqueeze(0).unsqueeze(2)# multiply each complex number# (m, seq_len, n_heads, head_dim/2)x_rotated = x_complex * freqs_complex# convert back to the real number# (m, seq_len, n_heads, head_dim/2, 2)x_out = torch.view_as_real(x_rotated)# (m, seq_len, n_heads, head_dim)x_out = x_out.reshape(*x.shape)return x_out.type_as(x).to(device)

RMS norm

class RMSNorm(nn.Module):def __init__(self, dim, eps=1e-6):super().__init__()self.eps = epsself.weight = nn.Parameter(torch.ones(dim))def _norm(self, x: torch.Tensor):# (m, seq_len, dim) * (m, seq_len, 1) = (m, seq_len, dim)# rsqrt: 1 / sqrt(x)return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)def forward(self, x: torch.Tensor):# weight is a gain parameter used to re-scale the standardized summed inputs# (dim) * (m, seq_len, dim) = (m, seq_Len, dim)return self.weight * self._norm(x.float()).type_as(x)

KV Caching

在这里插入图片描述
在这里插入图片描述

class KVCache:def __init__(self, max_batch_size, max_seq_len, n_kv_heads, head_dim, device):self.cache_k = torch.zeros((max_batch_size, max_seq_len, n_kv_heads, head_dim)).to(device)self.cache_v = torch.zeros((max_batch_size, max_seq_len, n_kv_heads, head_dim)).to(device)def update(self, batch_size, start_pos, xk, xv):self.cache_k[:batch_size, start_pos :start_pos + xk.size(1)] = xkself.cache_v[:batch_size, start_pos :start_pos + xv.size(1)] = xvdef get(self, batch_size, start_pos, seq_len):keys = self.cache_k[:batch_size,  :start_pos + seq_len]values = self.cache_v[:batch_size, :start_pos + seq_len]return keys, values

Grouped Query Attention

在这里插入图片描述

def repeat_kv(x, n_rep):batch_size, seq_len, n_kv_heads, head_dim = x.shapeif n_rep == 1:return xelse:# (m, seq_len, n_kv_heads, 1, head_dim)# --> (m, seq_len, n_kv_heads, n_rep, head_dim)# --> (m, seq_len, n_kv_heads * n_rep, head_dim)return (x[:, :, :, None, :].expand(batch_size, seq_len, n_kv_heads, n_rep, head_dim).reshape(batch_size, seq_len, n_kv_heads * n_rep, head_dim))class SelfAttention(nn.Module):def __init__(self, config):super().__init__()self.n_heads = config['n_heads']self.n_kv_heads = config['n_kv_heads']self.dim = config['embed_dim']self.n_kv_heads = self.n_heads if self.n_kv_heads is None else self.n_kv_headsself.n_heads_q = self.n_headsself.n_rep = self.n_heads_q // self.n_kv_headsself.head_dim = self.dim // self.n_headsself.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False)self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)self.cache = KVCache(max_batch_size=config['max_batch_size'],max_seq_len=config['max_seq_len'],n_kv_heads=self.n_kv_heads,head_dim=self.head_dim,device=config['device'])def forward(self, x, start_pos, freqs_complex):# seq_len is always 1 during inferencebatch_size, seq_len, _ = x.shape# (m, seq_len, dim)xq = self.wq(x)# (m, seq_len, h_kv * head_dim)xk = self.wk(x)xv = self.wv(x)# (m, seq_len, n_heads, head_dim)xq = xq.view(batch_size, seq_len, self.n_heads_q, self.head_dim)# (m, seq_len, h_kv, head_dim)xk = xk.view(batch_size, seq_len, self.n_kv_heads, self.head_dim)xv = xv.view(batch_size, seq_len, self.n_kv_heads, self.head_dim)# (m, seq_len, num_head, head_dim)xq = apply_rotary_embeddings(xq, freqs_complex, device=x.device)# (m, seq_len, h_kv, head_dim)xk = apply_rotary_embeddings(xk, freqs_complex, device=x.device)# replace the entry in the cacheself.cache.update(batch_size, start_pos, xk, xv)# (m, seq_len, h_kv, head_dim)keys, values = self.cache.get(batch_size, start_pos, seq_len)# (m, seq_len, h_kv, head_dim) --> (m, seq_len, n_heads, head_dim)keys = repeat_kv(keys, self.n_rep)values = repeat_kv(values, self.n_rep)# (m, n_heads, seq_len, head_dim)# seq_len is 1 for xq during inferencexq = xq.transpose(1, 2)# (m, n_heads, seq_len, head_dim)keys = keys.transpose(1, 2)values = values.transpose(1, 2)# (m, n_heads, seq_len_q, head_dim) @ (m, n_heads, head_dim, seq_len) -> (m, n_heads, seq_len_q, seq_len)scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)# (m, n_heads, seq_len_q, seq_len)scores = F.softmax(scores.float(), dim=-1).type_as(xq)# (m, n_heads, seq_len_q, seq_len) @ (m, n_heads, seq_len, head_dim) -> (m, n_heads, seq_len_q, head_dim)output = torch.matmul(scores, values)# ((m, n_heads, seq_len_q, head_dim) -> (m, seq_len_q, dim)output = (output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1))# (m, seq_len_q, dim)return self.wo(output)

SwiGlu

def sigmoid(x, beta=1):return 1 / (1 + torch.exp(-x * beta))def swiglu(x, beta=1):return x * sigmoid(x, beta)
class FeedForward(nn.Module):def __init__(self, config):super().__init__()hidden_dim = 4 * config['embed_dim']hidden_dim = int(2 * hidden_dim / 3)if config['ffn_dim_multiplier'] is not None:hidden_dim = int(config['ffn_dim_multiplier'] * hidden_dim)# Round the hidden_dim to the nearest multiple of the multiple_of parameterhidden_dim = config['multiple_of'] * ((hidden_dim + config['multiple_of'] - 1) // config['multiple_of'])self.w1 = nn.Linear(config['embed_dim'], hidden_dim, bias=False)self.w2 = nn.Linear(config['embed_dim'], hidden_dim, bias=False)self.w3 = nn.Linear(hidden_dim, config['embed_dim'], bias=False)def forward(self, x: torch.Tensor):# (m, seq_len, dim) --> (m, seq_len, hidden_dim)swish = swiglu(self.w1(x))# (m, seq_len, dim) --> (m, seq_len, hidden_dim)x_V = self.w2(x)# (m, seq_len, hidden_dim)x = swish * x_V# (m, seq_len, hidden_dim) --> (m, seq_len, dim)return self.w3(x)

小结

  • padding 方式

reference

  • llama tech report
  • 源码:transformers
  • 参数量计算: https://zhuanlan.zhihu.com/p/676113501
  • 基于 MLX 的 LLAMA2-13B 的详细分析 - 亚东的文章 - 知乎 https://zhuanlan.zhihu.com/p/677125915
  • 2023年你最喜欢的MLSys相关的工作是什么? - Lin Zhang的回答 - 知乎
  • https://ai.plainenglish.io/understanding-llama2-kv-cache-grouped-query-attention-rotary-embedding-and-more-c17e5f49a6d7
  • https://github.com/wdndev/llama3-from-scratch-zh/blob/main/llama3/model.py
http://www.lryc.cn/news/440249.html

相关文章:

  • React学习day07-ReactRouter-抽象路由模块、路由导航、路由导航传参、嵌套路由、默认二级路由的设置、两种路由模式
  • Unity项目的脚本继承关系
  • 【自动驾驶】决策规划算法(一)决策规划仿真平台搭建 | Matlab + Prescan + Carsim 联合仿真基本操作
  • grep 命令:文本搜索
  • python画图|中秋到了,尝试画个月亮(球体画法)
  • 【网络安全的神秘世界】攻防环境搭建及漏洞原理学习
  • pythonnet python图像 C# .NET图像 互转
  • spring security OAuth2 搭建资源服务器以及授权服务器/jdbc/jwt两种方案
  • 计算机视觉—3d点云数据基础
  • Matlab simulink建模与仿真 第十八章(Stateflow状态机)
  • Linux系统终端中文件权限的10位字符是什么意思
  • Qt QSerialPort串口编程
  • 扫雷游戏及其中的知识点
  • 【乐企-业务篇】开票前置校验服务-规则链服务接口实现(发票基础信息校验)
  • 【搜索算法】以扩召回为目标,item-tag不如query-tag能扩更多数量
  • SpringBoot入门(黑马)
  • Stream流操作
  • 【Linux】查看操作系统开机时初始化的驱动模块列表的一个方法
  • 快速入门Vue
  • ubuntu系统服务器离线安装python包
  • re题(30)BUUCTF-[HDCTF2019]Maze
  • day36+day37 0-1背包
  • PostMan使用变量
  • 多线程同步
  • 第159天:安全开发-Python-协议库爆破FTPSSHRedisSMTPMYSQL等
  • 软件测试 | APP测试 —— Appium 的环境搭建及工具安装教程
  • 计算机人工智能前沿进展-大语言模型方向-2024-09-13
  • 衡石分析平台使用手册-替换衡石minio
  • 怎么将几个pdf合成为一个?把几个PDF合并成为一个的8种方法
  • 明明没有程序占用端口,但是启动程序却提示端口无法使用,项目也启动失败