Update rwkv.
This commit is contained in:
parent
4f18296e40
commit
240858c030
|
@ -1,55 +0,0 @@
|
|||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include "ATen/ATen.h"
|
||||
|
||||
typedef at::Half bf16;
|
||||
// typedef at::BFloat16 bf16;
|
||||
|
||||
template <typename F>
|
||||
__global__ void kernel_forward(const int B, const int T, const int C, const int H,
|
||||
const F *__restrict__ const _r, const F *__restrict__ const _w, const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _a, const F *__restrict__ const _b,
|
||||
F *__restrict__ const _y)
|
||||
{
|
||||
const int e = blockIdx.x / H;
|
||||
const int h = blockIdx.x % H;
|
||||
const int i = threadIdx.x;
|
||||
|
||||
float state[_N_] = {0};
|
||||
__shared__ float r[_N_], k[_N_], w[_N_], a[_N_], b[_N_];
|
||||
|
||||
for (int _t = 0; _t < T; _t++)
|
||||
{
|
||||
const int t = e*T*C + h*_N_ + i + _t * C;
|
||||
__syncthreads();
|
||||
r[i] = float(_r[t]);
|
||||
w[i] = __expf(-__expf(float(_w[t])));
|
||||
k[i] = float(_k[t]);
|
||||
a[i] = float(_a[t]);
|
||||
b[i] = float(_b[t]);
|
||||
__syncthreads();
|
||||
|
||||
float sa = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < _N_; j++)
|
||||
{
|
||||
sa += a[j] * state[j];
|
||||
}
|
||||
|
||||
float vv = float(_v[t]);
|
||||
float y = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < _N_; j++)
|
||||
{
|
||||
float& s = state[j];
|
||||
s = s * w[j] + k[j] * vv + sa * b[j];
|
||||
y += s * r[j];
|
||||
}
|
||||
_y[t] = F(y);
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_forward(int B, int T, int C, int H, bf16 *r, bf16* w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y)
|
||||
{
|
||||
assert(H*_N_ == C);
|
||||
kernel_forward<<<dim3(B * H), dim3(_N_)>>>(B, T, C, H, r, w, k, v, a, b, y);
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
#include <torch/extension.h>
|
||||
#include "ATen/ATen.h"
|
||||
|
||||
typedef at::Half bf16;
|
||||
// typedef at::BFloat16 bf16;
|
||||
|
||||
void cuda_forward(int B, int T, int C, int H, bf16 *r, bf16 *w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y);
|
||||
|
||||
void forward(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &r, torch::Tensor &w, torch::Tensor &k, torch::Tensor &v, torch::Tensor &a, torch::Tensor &b, torch::Tensor &y) {
|
||||
cuda_forward(B, T, C, H, r.data_ptr<bf16>(), w.data_ptr<bf16>(), k.data_ptr<bf16>(), v.data_ptr<bf16>(), a.data_ptr<bf16>(), b.data_ptr<bf16>(), y.data_ptr<bf16>());
|
||||
}
|
||||
|
||||
TORCH_LIBRARY(wkv7, m) {
|
||||
m.def("forward", forward);
|
||||
}
|
|
@ -1,64 +0,0 @@
|
|||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include "ATen/ATen.h"
|
||||
|
||||
typedef at::Half bf16;
|
||||
// typedef at::BFloat16 bf16;
|
||||
|
||||
template <typename F>
|
||||
__global__ void kernel_forward(const int B, const int T, const int C, const int H,
|
||||
float *__restrict__ _state, const F *__restrict__ const _r, const F *__restrict__ const _w, const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _a, const F *__restrict__ const _b,
|
||||
F *__restrict__ const _y)
|
||||
{
|
||||
const int e = blockIdx.x / H;
|
||||
const int h = blockIdx.x % H;
|
||||
const int i = threadIdx.x;
|
||||
_state += h*_N_*_N_ + i*_N_; // wrong if B > 1 !!!
|
||||
|
||||
float state[_N_];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < _N_; j++)
|
||||
state[j] = _state[j];
|
||||
|
||||
__shared__ float r[_N_], k[_N_], w[_N_], a[_N_], b[_N_];
|
||||
|
||||
for (int _t = 0; _t < T; _t++)
|
||||
{
|
||||
const int t = e*T*C + h*_N_ + i + _t * C;
|
||||
__syncthreads();
|
||||
r[i] = float(_r[t]);
|
||||
w[i] = __expf(-__expf(float(_w[t])));
|
||||
k[i] = float(_k[t]);
|
||||
a[i] = float(_a[t]);
|
||||
b[i] = float(_b[t]);
|
||||
__syncthreads();
|
||||
|
||||
float sa = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < _N_; j++)
|
||||
{
|
||||
sa += a[j] * state[j];
|
||||
}
|
||||
|
||||
float vv = float(_v[t]);
|
||||
float y = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < _N_; j++)
|
||||
{
|
||||
float& s = state[j];
|
||||
s = s * w[j] + k[j] * vv + sa * b[j];
|
||||
y += s * r[j];
|
||||
}
|
||||
_y[t] = F(y);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j = 0; j < _N_; j++)
|
||||
_state[j] = state[j];
|
||||
}
|
||||
|
||||
void cuda_forward(int B, int T, int C, int H, float *state, bf16 *r, bf16* w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y)
|
||||
{
|
||||
assert(H*_N_ == C);
|
||||
assert(B == 1); // only for B=1
|
||||
kernel_forward<<<dim3(B * H), dim3(_N_)>>>(B, T, C, H, state, r, w, k, v, a, b, y);
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
#include <torch/extension.h>
|
||||
#include "ATen/ATen.h"
|
||||
|
||||
typedef at::Half bf16;
|
||||
// typedef at::BFloat16 bf16;
|
||||
|
||||
void cuda_forward(int B, int T, int C, int H, float *state, bf16 *r, bf16 *w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y);
|
||||
|
||||
void forward(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &w, torch::Tensor &k, torch::Tensor &v, torch::Tensor &a, torch::Tensor &b, torch::Tensor &y) {
|
||||
cuda_forward(B, T, C, H, state.data_ptr<float>(), r.data_ptr<bf16>(), w.data_ptr<bf16>(), k.data_ptr<bf16>(), v.data_ptr<bf16>(), a.data_ptr<bf16>(), b.data_ptr<bf16>(), y.data_ptr<bf16>());
|
||||
}
|
||||
|
||||
TORCH_LIBRARY(wkv7s, m) {
|
||||
m.def("forward", forward);
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
|
||||
|
||||
|
||||
R-Receptance 这个接受度可以从代码上直接看到,它是模型对过去的记忆程度。
|
||||
W-Weight 这个Weight本身并不是一个泛指,是一个过去信息的时间衰减
|
||||
K、V 就是等同于Transformer的Key与Value。
|
||||
|
||||
- 记住过去的信息(通过 V)
|
||||
- 找到相关的信息(通过 K)
|
||||
- 控制信息的重要性(通过 W)
|
||||
- 决定使用多少信息(通过 R)
|
||||
|
||||
|
||||
TimeMix,指的是过去信息x-1与当前信息x的混合。 xx = self.time_shift(x) - x 这个是典型的操作
|
||||
|
||||
|
||||
|
||||
nn.Embedding
|
|
@ -42,8 +42,6 @@ DTYPE = torch.half # better
|
|||
args.head_size_a = 64 # don't change
|
||||
HEAD_SIZE = args.head_size_a
|
||||
|
||||
USE_CUDA_KERNEL = True # False => UNOPTIMIZED, VERY SLOW
|
||||
|
||||
########################################################################################################
|
||||
# RWKV Tokenizer (slow version)
|
||||
########################################################################################################
|
||||
|
@ -129,93 +127,6 @@ class RWKV_TOKENIZER:
|
|||
|
||||
tokenizer = RWKV_TOKENIZER("rwkv_vocab_v20230424.txt")
|
||||
|
||||
########################################################################################################
|
||||
# CUDA Kernel
|
||||
########################################################################################################
|
||||
|
||||
if USE_CUDA_KERNEL:
|
||||
|
||||
from torch.utils.cpp_extension import load
|
||||
|
||||
load(
|
||||
name="wkv7",
|
||||
sources=["cuda/wkv7_op.cpp", f"cuda/wkv7.cu"],
|
||||
is_python_module=False,
|
||||
verbose=True,
|
||||
extra_cuda_cflags=[
|
||||
"-res-usage",
|
||||
"--use_fast_math",
|
||||
"-O3",
|
||||
"-Xptxas -O3",
|
||||
"--extra-device-vectorization",
|
||||
f"-D_N_={HEAD_SIZE}",
|
||||
],
|
||||
)
|
||||
|
||||
class WKV_7(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(ctx, r, w, k, v, a, b):
|
||||
with torch.no_grad():
|
||||
B, T, C = r.size()
|
||||
H = C // HEAD_SIZE
|
||||
N = HEAD_SIZE
|
||||
assert HEAD_SIZE == C // H
|
||||
assert r.dtype == DTYPE
|
||||
assert w.dtype == DTYPE
|
||||
assert k.dtype == DTYPE
|
||||
assert v.dtype == DTYPE
|
||||
assert a.dtype == DTYPE
|
||||
assert b.dtype == DTYPE
|
||||
assert r.is_contiguous()
|
||||
assert w.is_contiguous()
|
||||
assert k.is_contiguous()
|
||||
assert v.is_contiguous()
|
||||
assert a.is_contiguous()
|
||||
assert b.is_contiguous()
|
||||
y = torch.empty((B, T, C), device=k.device, dtype=DTYPE, memory_format=torch.contiguous_format)
|
||||
torch.ops.wkv7.forward(B, T, C, H, r, w, k, v, a, b, y)
|
||||
return y
|
||||
|
||||
def RWKV7_OP(r, w, k, v, a, b):
|
||||
return WKV_7.apply(r, w, k, v, a, b)
|
||||
|
||||
else:
|
||||
|
||||
def RWKV7_OP(r, w, k, v, a, b):
|
||||
B, T, C = r.size()
|
||||
H = C // HEAD_SIZE
|
||||
N = HEAD_SIZE
|
||||
r = r.view(B, T, H, N).float()
|
||||
k = k.view(B, T, H, N).float()
|
||||
v = v.view(B, T, H, N).float()
|
||||
a = a.view(B, T, H, N).float()
|
||||
b = b.view(B, T, H, N).float()
|
||||
w = torch.exp(-torch.exp(w.view(B, T, H, N).float()))
|
||||
out = torch.zeros((B, T, H, N), device=r.device, dtype=torch.float)
|
||||
state = torch.zeros((B, H, N, N), device=r.device, dtype=torch.float)
|
||||
|
||||
for t in range(T):
|
||||
kk = k[:, t, :].view(B, H, 1, N)
|
||||
rr = r[:, t, :].view(B, H, N, 1)
|
||||
vv = v[:, t, :].view(B, H, N, 1)
|
||||
aa = a[:, t, :].view(B, H, N, 1)
|
||||
bb = b[:, t, :].view(B, H, 1, N)
|
||||
state = state * w[:, t, :, None, :] + state @ aa @ bb + vv @ kk
|
||||
out[:, t, :] = (state @ rr).view(B, H, N)
|
||||
|
||||
# another method using einsum
|
||||
#
|
||||
# kk = k[:, t, :]
|
||||
# rr = r[:, t, :]
|
||||
# vv = v[:, t, :]
|
||||
# aa = a[:, t, :]
|
||||
# bb = b[:, t, :]
|
||||
# sab = torch.einsum('bhik,bhk,bhj->bhij', state, aa, bb)
|
||||
# state = state * w[: , t, :, None, :] + sab + torch.einsum('bhj,bhi->bhij', kk, vv)
|
||||
# out[:, t, :] = torch.einsum('bhj,bhij->bhi', rr, state)
|
||||
|
||||
return out.view(B, T, C).to(dtype=DTYPE)
|
||||
|
||||
|
||||
########################################################################################################
|
||||
# RWKV TimeMix
|
||||
|
@ -296,7 +207,32 @@ class RWKV_Tmix_x070(Module):
|
|||
kk = F.normalize(kk.view(B, T, H, -1), dim=-1, p=2.0).view(B, T, C)
|
||||
k = k * (1 + (a - 1) * self.k_a)
|
||||
|
||||
def RWKV7_OP(r, w, k, v, a, b):
|
||||
B, T, C = r.size()
|
||||
H = C // HEAD_SIZE
|
||||
N = HEAD_SIZE
|
||||
r = r.view(B, T, H, N).float()
|
||||
k = k.view(B, T, H, N).float()
|
||||
v = v.view(B, T, H, N).float()
|
||||
a = a.view(B, T, H, N).float()
|
||||
b = b.view(B, T, H, N).float()
|
||||
w = torch.exp(-torch.exp(w.view(B, T, H, N).float()))
|
||||
out = torch.zeros((B, T, H, N), device=r.device, dtype=torch.float)
|
||||
state = torch.zeros((B, H, N, N), device=r.device, dtype=torch.float)
|
||||
|
||||
for t in range(T):
|
||||
kk = k[:, t, :].view(B, H, 1, N)
|
||||
rr = r[:, t, :].view(B, H, N, 1)
|
||||
vv = v[:, t, :].view(B, H, N, 1)
|
||||
aa = a[:, t, :].view(B, H, N, 1)
|
||||
bb = b[:, t, :].view(B, H, 1, N)
|
||||
state = state * w[:, t, :, None, :] + state @ aa @ bb + vv @ kk
|
||||
out[:, t, :] = (state @ rr).view(B, H, N)
|
||||
|
||||
return out.view(B, T, C).to(dtype=DTYPE)
|
||||
|
||||
x = RWKV7_OP(r, w, k, v, -kk, kk * a)
|
||||
|
||||
x = self.ln_x(x.view(B * T, C)).view(B, T, C)
|
||||
|
||||
x = x + (
|
||||
|
|
Loading…
Reference in New Issue