Update rwkv.

2025-03-03 21:30:58 +08:00 · 2025-03-03 21:30:58 +08:00 · 240858c030
parent 4f18296e40
commit 240858c030
6 changed files with 43 additions and 238 deletions
--- a/rwkv/RWKV-v7/cuda/wkv7.cu
+++ b/rwkv/RWKV-v7/cuda/wkv7.cu
@ -1,55 +0,0 @@
-#include <stdio.h>
-#include <assert.h>
-#include "ATen/ATen.h"
-
-typedef at::Half bf16;
-// typedef at::BFloat16 bf16;
-
-template <typename F>
-__global__ void kernel_forward(const int B, const int T, const int C, const int H,
-                               const F *__restrict__ const _r, const F *__restrict__ const _w, const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _a, const F *__restrict__ const _b,
-                               F *__restrict__ const _y)
-{
-    const int e = blockIdx.x / H;
-    const int h = blockIdx.x % H;
-    const int i = threadIdx.x;
-
-    float state[_N_] = {0};
-    __shared__ float r[_N_], k[_N_], w[_N_], a[_N_], b[_N_];
-
-    for (int _t = 0; _t < T; _t++)
-    {
-        const int t = e*T*C + h*_N_ + i + _t * C;
-        __syncthreads();
-        r[i] = float(_r[t]);
-        w[i] = __expf(-__expf(float(_w[t])));
-        k[i] = float(_k[t]);
-        a[i] = float(_a[t]);
-        b[i] = float(_b[t]);
-        __syncthreads();
-
-        float sa = 0;
-        #pragma unroll
-        for (int j = 0; j < _N_; j++)
-        {
-            sa += a[j] * state[j];
-        }
-
-        float vv = float(_v[t]);
-        float y = 0;
-        #pragma unroll
-        for (int j = 0; j < _N_; j++)
-        {
-            float& s = state[j];
-            s = s * w[j] + k[j] * vv + sa * b[j];
-            y += s * r[j];
-        }
-        _y[t] = F(y);
-    }
-}
-
-void cuda_forward(int B, int T, int C, int H, bf16 *r, bf16* w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y)
-{
-    assert(H*_N_ == C);
-    kernel_forward<<<dim3(B * H), dim3(_N_)>>>(B, T, C, H, r, w, k, v, a, b, y);
-}
--- a/rwkv/RWKV-v7/cuda/wkv7_op.cpp
+++ b/rwkv/RWKV-v7/cuda/wkv7_op.cpp
@ -1,15 +0,0 @@
-#include <torch/extension.h>
-#include "ATen/ATen.h"
-
-typedef at::Half bf16;
-// typedef at::BFloat16 bf16;
-
-void cuda_forward(int B, int T, int C, int H, bf16 *r, bf16 *w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y);
-
-void forward(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &r, torch::Tensor &w, torch::Tensor &k, torch::Tensor &v, torch::Tensor &a, torch::Tensor &b, torch::Tensor &y) {
-    cuda_forward(B, T, C, H, r.data_ptr<bf16>(), w.data_ptr<bf16>(), k.data_ptr<bf16>(), v.data_ptr<bf16>(), a.data_ptr<bf16>(), b.data_ptr<bf16>(), y.data_ptr<bf16>());
-}
-
-TORCH_LIBRARY(wkv7, m) {
-    m.def("forward", forward);
-}
--- a/rwkv/RWKV-v7/cuda/wkv7s.cu
+++ b/rwkv/RWKV-v7/cuda/wkv7s.cu
@ -1,64 +0,0 @@
-#include <stdio.h>
-#include <assert.h>
-#include "ATen/ATen.h"
-
-typedef at::Half bf16;
-// typedef at::BFloat16 bf16;
-
-template <typename F>
-__global__ void kernel_forward(const int B, const int T, const int C, const int H,
-                               float *__restrict__ _state, const F *__restrict__ const _r, const F *__restrict__ const _w, const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _a, const F *__restrict__ const _b,
-                               F *__restrict__ const _y)
-{
-    const int e = blockIdx.x / H;
-    const int h = blockIdx.x % H;
-    const int i = threadIdx.x;
-    _state += h*_N_*_N_ + i*_N_; // wrong if B > 1 !!!
-
-    float state[_N_];
-    #pragma unroll
-    for (int j = 0; j < _N_; j++)
-        state[j] = _state[j];
-
-    __shared__ float r[_N_], k[_N_], w[_N_], a[_N_], b[_N_];
-
-    for (int _t = 0; _t < T; _t++)
-    {
-        const int t = e*T*C + h*_N_ + i + _t * C;
-        __syncthreads();
-        r[i] = float(_r[t]);
-        w[i] = __expf(-__expf(float(_w[t])));
-        k[i] = float(_k[t]);
-        a[i] = float(_a[t]);
-        b[i] = float(_b[t]);
-        __syncthreads();
-
-        float sa = 0;
-        #pragma unroll
-        for (int j = 0; j < _N_; j++)
-        {
-            sa += a[j] * state[j];
-        }
-
-        float vv = float(_v[t]);
-        float y = 0;
-        #pragma unroll
-        for (int j = 0; j < _N_; j++)
-        {
-            float& s = state[j];
-            s = s * w[j] + k[j] * vv + sa * b[j];
-            y += s * r[j];
-        }
-        _y[t] = F(y);
-    }
-    #pragma unroll
-    for (int j = 0; j < _N_; j++)
-        _state[j] = state[j];    
-}
-
-void cuda_forward(int B, int T, int C, int H, float *state, bf16 *r, bf16* w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y)
-{
-    assert(H*_N_ == C);
-    assert(B == 1); // only for B=1
-    kernel_forward<<<dim3(B * H), dim3(_N_)>>>(B, T, C, H, state, r, w, k, v, a, b, y);
-}
--- a/rwkv/RWKV-v7/cuda/wkv7s_op.cpp
+++ b/rwkv/RWKV-v7/cuda/wkv7s_op.cpp
@ -1,15 +0,0 @@
-#include <torch/extension.h>
-#include "ATen/ATen.h"
-
-typedef at::Half bf16;
-// typedef at::BFloat16 bf16;
-
-void cuda_forward(int B, int T, int C, int H, float *state, bf16 *r, bf16 *w, bf16 *k, bf16 *v, bf16 *a, bf16 *b, bf16 *y);
-
-void forward(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &state, torch::Tensor &r, torch::Tensor &w, torch::Tensor &k, torch::Tensor &v, torch::Tensor &a, torch::Tensor &b, torch::Tensor &y) {
-    cuda_forward(B, T, C, H, state.data_ptr<float>(), r.data_ptr<bf16>(), w.data_ptr<bf16>(), k.data_ptr<bf16>(), v.data_ptr<bf16>(), a.data_ptr<bf16>(), b.data_ptr<bf16>(), y.data_ptr<bf16>());
-}
-
-TORCH_LIBRARY(wkv7s, m) {
-    m.def("forward", forward);
-}
--- a/rwkv/RWKV-v7/model.md
+++ b/rwkv/RWKV-v7/model.md
@ -0,0 +1,18 @@
+
+
+
+R-Receptance 即“接受度”，可以从代码上直接看到：它表示模型对过去记忆的接受程度。
+W-Weight 这里的 Weight 并不是泛指的“权重”，而是对过去信息的时间衰减（time decay）。
+K、V 就是等同于Transformer的Key与Value。
+
+- 记住过去的信息（通过 V）
+- 找到相关的信息（通过 K）
+- 控制信息的重要性（通过 W）
+- 决定使用多少信息（通过 R）
+
+
+TimeMix 指的是上一时刻的信息 x_{t-1} 与当前时刻的信息 x_t 的混合。典型的操作是 `xx = self.time_shift(x) - x`。
+
+
+
+nn.Embedding
--- a/rwkv/RWKV-v7/rwkv_v7_demo.py
+++ b/rwkv/RWKV-v7/rwkv_v7_demo.py
@ -42,8 +42,6 @@ DTYPE = torch.half  # better
 args.head_size_a = 64  # don't change
 HEAD_SIZE = args.head_size_a

-USE_CUDA_KERNEL = True  # False => UNOPTIMIZED, VERY SLOW
-
 ########################################################################################################
 # RWKV Tokenizer (slow version)
 ########################################################################################################
@ -129,93 +127,6 @@ class RWKV_TOKENIZER:

 tokenizer = RWKV_TOKENIZER("rwkv_vocab_v20230424.txt")

-########################################################################################################
-# CUDA Kernel
-########################################################################################################
-
-if USE_CUDA_KERNEL:
-
-    from torch.utils.cpp_extension import load
-
-    load(
-        name="wkv7",
-        sources=["cuda/wkv7_op.cpp", f"cuda/wkv7.cu"],
-        is_python_module=False,
-        verbose=True,
-        extra_cuda_cflags=[
-            "-res-usage",
-            "--use_fast_math",
-            "-O3",
-            "-Xptxas -O3",
-            "--extra-device-vectorization",
-            f"-D_N_={HEAD_SIZE}",
-        ],
-    )
-
-    class WKV_7(torch.autograd.Function):
-        @staticmethod
-        def forward(ctx, r, w, k, v, a, b):
-            with torch.no_grad():
-                B, T, C = r.size()
-                H = C // HEAD_SIZE
-                N = HEAD_SIZE
-                assert HEAD_SIZE == C // H
-                assert r.dtype == DTYPE
-                assert w.dtype == DTYPE
-                assert k.dtype == DTYPE
-                assert v.dtype == DTYPE
-                assert a.dtype == DTYPE
-                assert b.dtype == DTYPE
-                assert r.is_contiguous()
-                assert w.is_contiguous()
-                assert k.is_contiguous()
-                assert v.is_contiguous()
-                assert a.is_contiguous()
-                assert b.is_contiguous()
-                y = torch.empty((B, T, C), device=k.device, dtype=DTYPE, memory_format=torch.contiguous_format)
-                torch.ops.wkv7.forward(B, T, C, H, r, w, k, v, a, b, y)
-                return y
-
-    def RWKV7_OP(r, w, k, v, a, b):
-        return WKV_7.apply(r, w, k, v, a, b)
-
-else:
-
-    def RWKV7_OP(r, w, k, v, a, b):
-        B, T, C = r.size()
-        H = C // HEAD_SIZE
-        N = HEAD_SIZE
-        r = r.view(B, T, H, N).float()
-        k = k.view(B, T, H, N).float()
-        v = v.view(B, T, H, N).float()
-        a = a.view(B, T, H, N).float()
-        b = b.view(B, T, H, N).float()
-        w = torch.exp(-torch.exp(w.view(B, T, H, N).float()))
-        out = torch.zeros((B, T, H, N), device=r.device, dtype=torch.float)
-        state = torch.zeros((B, H, N, N), device=r.device, dtype=torch.float)
-
-        for t in range(T):
-            kk = k[:, t, :].view(B, H, 1, N)
-            rr = r[:, t, :].view(B, H, N, 1)
-            vv = v[:, t, :].view(B, H, N, 1)
-            aa = a[:, t, :].view(B, H, N, 1)
-            bb = b[:, t, :].view(B, H, 1, N)
-            state = state * w[:, t, :, None, :] + state @ aa @ bb + vv @ kk
-            out[:, t, :] = (state @ rr).view(B, H, N)
-
-            # another method using einsum
-            #
-            # kk = k[:, t, :]
-            # rr = r[:, t, :]
-            # vv = v[:, t, :]
-            # aa = a[:, t, :]
-            # bb = b[:, t, :]
-            # sab = torch.einsum('bhik,bhk,bhj->bhij', state, aa, bb)
-            # state = state * w[: , t, :, None, :] + sab + torch.einsum('bhj,bhi->bhij', kk, vv)
-            # out[:, t, :] = torch.einsum('bhj,bhij->bhi', rr, state)
-
-        return out.view(B, T, C).to(dtype=DTYPE)
-

 ########################################################################################################
 # RWKV TimeMix
@ -296,7 +207,32 @@ class RWKV_Tmix_x070(Module):
        kk = F.normalize(kk.view(B, T, H, -1), dim=-1, p=2.0).view(B, T, C)
        k = k * (1 + (a - 1) * self.k_a)

+        def RWKV7_OP(r, w, k, v, a, b):
+            B, T, C = r.size()
+            H = C // HEAD_SIZE
+            N = HEAD_SIZE
+            r = r.view(B, T, H, N).float()
+            k = k.view(B, T, H, N).float()
+            v = v.view(B, T, H, N).float()
+            a = a.view(B, T, H, N).float()
+            b = b.view(B, T, H, N).float()
+            w = torch.exp(-torch.exp(w.view(B, T, H, N).float()))
+            out = torch.zeros((B, T, H, N), device=r.device, dtype=torch.float)
+            state = torch.zeros((B, H, N, N), device=r.device, dtype=torch.float)
+
+            for t in range(T):
+                kk = k[:, t, :].view(B, H, 1, N)
+                rr = r[:, t, :].view(B, H, N, 1)
+                vv = v[:, t, :].view(B, H, N, 1)
+                aa = a[:, t, :].view(B, H, N, 1)
+                bb = b[:, t, :].view(B, H, 1, N)
+                state = state * w[:, t, :, None, :] + state @ aa @ bb + vv @ kk
+                out[:, t, :] = (state @ rr).view(B, H, N)
+
+            return out.view(B, T, C).to(dtype=DTYPE)
+
        x = RWKV7_OP(r, w, k, v, -kk, kk * a)
+
        x = self.ln_x(x.view(B * T, C)).view(B, T, C)

        x = x + (