This is a great model, but there is one issue:
I suggest strengthening its code knowledge and coding ability in the next version.
Below is its output on https://chat.lmsys.org/ evaluated with temperature 0.9.
My input:
Modify the model below to use bidirectional attention: keep the original components and add one extra component, fuse the forward and backward attention outputs, and make sure the model still generates tokens in order while learning bidirectional inputs and outputs during training:
```python
class Attention(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
        assert args.n_heads % self.n_kv_heads == 0
        model_parallel_size = 1
        self.n_local_heads = args.n_heads // model_parallel_size
        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.dim // args.n_heads
        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
        self.attn_dropout = nn.Dropout(args.dropout)
        self.resid_dropout = nn.Dropout(args.dropout)
        self.dropout = args.dropout
        # use flash attention or a manual implementation?
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
            mask = torch.triu(mask, diagonal=1)
            self.register_buffer("mask", mask)

    def forward(
        self,
        x: torch.Tensor,
        freqs_cos: torch.Tensor,
        freqs_sin: torch.Tensor,
    ):
        bsz, seqlen, _ = x.shape

        # QKV
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        # RoPE relative positional embeddings
        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)

        # grouped multiquery attention: expand out keys and values
        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)

        # make heads into a batch dimension
        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        # flash implementation
        if self.flash:
            output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True)
        else:
            # manual implementation
            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
            assert hasattr(self, 'mask')
            scores = scores + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, cache_len + seqlen)
            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
            scores = self.attn_dropout(scores)
            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)

        # restore time as batch dimension and concat heads
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)

        # final projection into the residual stream
        output = self.wo(output)
        output = self.resid_dropout(output)
        return output
```
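(For anyone reproducing this: the Attention class above is llama2.c-style code and assumes a ModelArgs config plus the apply_rotary_emb and repeat_kv helpers from that codebase. The sketch below is only a guess at the fields this class actually reads, with placeholder defaults, not the repo's real definition.)

```python
# Hypothetical stand-in for the ModelArgs this class expects; only the
# fields read by Attention are listed, and the defaults are placeholders.
from dataclasses import dataclass
from typing import Optional

@dataclass
class ModelArgs:
    dim: int = 512                    # model width; head_dim = dim // n_heads
    n_heads: int = 8                  # number of query heads
    n_kv_heads: Optional[int] = None  # key/value heads for grouped-query attention; None means n_heads
    max_seq_len: int = 1024           # length of the causal mask in the non-flash path
    dropout: float = 0.0              # attention / residual dropout probability
```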
qwen1.5 72b-chat:
To turn the model into a bidirectional attention model, we can add a backward attention computation to the existing module and fuse the forward and backward attention outputs. Here is the modified code:
```python
import torch
import torch.nn as nn

class BiAttention(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
        assert args.n_heads % self.n_kv_heads == 0
        model_parallel_size = 1
        self.n_local_heads = args.n_heads // model_parallel_size
        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.dim // args.n_heads
        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wq_back = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
        self.wk_back = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv_back = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * self.head_dim * 2, args.dim, bias=False)
        self.attn_dropout = nn.Dropout(args.dropout)
        self.resid_dropout = nn.Dropout(args.dropout)
        self.dropout = args.dropout
        # use flash attention or a manual implementation?
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
            mask = torch.triu(mask, diagonal=1)
            self.register_buffer("mask", mask)

    def forward(
        self,
        x: torch.Tensor,
        freqs_cos: torch.Tensor,
        freqs_sin: torch.Tensor,
    ):
        bsz, seqlen, _ = x.shape

        # QKV (forward)
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        # QKV (backward)
        xq_back, xk_back, xv_back = self.wq_back(x.flip(dims=[1])), self.wk_back(x.flip(dims=[1])), self.wv_back(x.flip(dims=[1]))
        xq_back = xq_back.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk_back = xk_back.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        xv_back = xv_back.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        # RoPE relative positional embeddings
        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
        xq_back, xk_back = apply_rotary_emb(xq_back, xk_back, freqs_cos, freqs_sin)

        # grouped multiquery attention: expand out keys and values
        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
        xk_back = repeat_kv(xk_back, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
        xv_back = repeat_kv(xv_back, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)

        # make heads into a batch dimension
        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)
        xq_back = xq_back.transpose(1, 2)
        xk_back = xk_back.transpose(1, 2)
        xv_back = xv_back.transpose(1, 2)

        # flash implementation
        if self.flash:
            output_fw = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True)
            output_bw = torch.nn.functional.scaled_dot_product_attention(xq_back, xk_back, xv_back, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=False)
        else:
            # manual implementation
            scores_fw = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
            scores_bw = torch.matmul(xq_back, xk_back.transpose(2, 3)) / math.sqrt(self.head_dim)
            assert hasattr(self, 'mask')
            scores_fw = scores_fw + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, cache_len + seqlen)
            scores_bw = scores_bw + self.mask.flip(dims=[2, 3])[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, cache_len + seqlen)
            scores_fw = F.softmax(scores_fw.float(), dim=-1).type_as(xq)
            scores_bw = F.softmax(scores_bw.float(), dim=-1).type_as(xq_back)
            scores_fw = self.attn_dropout(scores_fw)
            scores_bw = self.attn_dropout(scores_bw)
            output_fw = torch.matmul(scores_fw, xv)  # (bs, n_local_heads, seqlen, head_dim)
            output_bw = torch.matmul(scores_bw, xv_back)  # (bs, n_local_heads, seqlen, head_dim)

        # restore time as batch dimension and concat heads (forward and backward)
        output_fw = output_fw.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
        output_bw = output_bw.flip(dims=[1]).transpose(1, 2).contiguous().view(bsz, seqlen, -1)
        output = torch.cat([output_fw, output_bw], dim=-1)

        # final projection into the residual stream
        output = self.wo(output)
        output = self.resid_dropout(output)
        return output
```
In this modified model, we add the backward attention computation (xq_back, xk_back, xv_back) and handle the forward and backward attention separately before merging their outputs. Note that, because the model has to learn bidirectional inputs and outputs during training, the backward projections (wq_back, wk_back, wv_back) are independent linear layers. When fusing the outputs, the forward and backward results are concatenated along the feature dimension (-1). This way the model still generates sequentially, while learning bidirectional inputs and outputs during training.
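For reference, here is a minimal standalone sketch of just the fusion step the answer describes, using dummy sizes that are not the repo's actual configuration. It only shows why wo has to accept twice the per-direction width; it does not validate the masking or the sequence flipping.

```python
import torch
import torch.nn as nn

# Dummy sizes, chosen only for illustration.
bsz, seqlen, n_heads, head_dim, dim = 2, 16, 8, 64, 512

output_fw = torch.randn(bsz, seqlen, n_heads * head_dim)  # forward attention output, time-ordered
output_bw = torch.randn(bsz, seqlen, n_heads * head_dim)  # backward attention output, flipped back to time order

fused = torch.cat([output_fw, output_bw], dim=-1)        # (bsz, seqlen, 2 * n_heads * head_dim)
wo = nn.Linear(n_heads * head_dim * 2, dim, bias=False)  # output projection must take the doubled width
print(wo(fused).shape)                                   # torch.Size([2, 16, 512])
```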