Revert ovxlib topk kernel modification (#694)

Internal ovxlib commit:b12b1f138e66c78e0fb4032e5399a68a7280a801 is revert for sw compatibility Type: Bug Fix Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
2024-04-27 07:54:30 +08:00 · 2024-04-27 07:54:30 +08:00 · e1c2f0a18d
parent 3b80968fb1
commit e1c2f0a18d
6 changed files with 25 additions and 782 deletions
--- a/src/tim/vx/internal/include/vsi_nn_context.h
+++ b/src/tim/vx/internal/include/vsi_nn_context.h
@ -60,7 +60,9 @@ typedef struct _vsi_nn_hw_config_t
 {
    char target_name[VSI_NN_MAX_TARGET_NAME];
    vsi_nn_hw_evis_t evis;
+#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
    uint32_t subGroupSize;
+#endif
    uint32_t use_40bits_va;
    uint32_t support_stream_processor;
    uint32_t sp_exec_count;
--- a/src/tim/vx/internal/include/vsi_nn_version.h
+++ b/src/tim/vx/internal/include/vsi_nn_version.h
@ -33,7 +33,7 @@ extern "C"{

 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 2
-#define VSI_NN_VERSION_PATCH 6
+#define VSI_NN_VERSION_PATCH 5
 #define VSI_NN_VERSION \
    (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)

--- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c
@ -34,24 +34,20 @@
 #include "vsi_nn_tensor_util.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
+#include "libnnext/vx_lib_nnext.h"

 __BEGIN_DECLS

 #define _TOPK_KERNEL_SOURCE      "topk"
 #define STR(a) #a
 // Add kernel hashtable here
-#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \
-        ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26))
+#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
+        ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
 #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
-        { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \
+        { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
          CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
          _TOPK_KERNEL_SOURCE }

-#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
-        { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \
-          CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
-          "topk2" }
-
 #define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
        ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
 #define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
@ -115,9 +111,6 @@ static const _kernel_map_type _topk_kernel_map[] =
    PACK_KERNEL_MAP( F32, I32, 4 ),
    PACK_KERNEL_MAP( F32, I32, 5 ),
    PACK_KERNEL_MAP( F32, I32, 6 ),
-
-    PACK_MERGE_KERNEL_MAP(U32, U32),
-    PACK_MERGE_KERNEL_MAP(I32, I32),
 };

 static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
@ -261,8 +254,7 @@ static vsi_status _query_kernel
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
-    int32_t num_stages,
-    vsi_bool is_bitnoic_segment
+    int32_t num_stages
    )
 {
    vsi_status status = VSI_FAILURE;
@ -280,23 +272,21 @@ static vsi_status _query_kernel
    in_dtype  = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

-    num_stages = is_bitnoic_segment ? 0 : num_stages;
-
    switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
    {
    case _PACK_SELECT_KEY(F32, F32):
    case _PACK_SELECT_KEY(F16, F16):
-        key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( F32, F32, num_stages );
        break;
    case _PACK_SELECT_KEY(U32, U32):
    case _PACK_SELECT_KEY(U16, U16):
    case _PACK_SELECT_KEY(U8,  U8):
-        key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( U32, U32, num_stages );
        break;
    case _PACK_SELECT_KEY(I32, I32):
    case _PACK_SELECT_KEY(I16, I16):
    case _PACK_SELECT_KEY(I8,  I8):
-        key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( I32, I32, num_stages );
        break;
    case _PACK_SELECT_KEY(F32, U32):
    case _PACK_SELECT_KEY(F16, U32):
@ -304,7 +294,7 @@ static vsi_status _query_kernel
    case _PACK_SELECT_KEY(F16, U16):
    case _PACK_SELECT_KEY(F32, U8):
    case _PACK_SELECT_KEY(F16, U8):
-        key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( F32, U32, num_stages );
        break;
    case _PACK_SELECT_KEY(F32, I32):
    case _PACK_SELECT_KEY(F16, I32):
@ -312,7 +302,7 @@ static vsi_status _query_kernel
    case _PACK_SELECT_KEY(F16, I16):
    case _PACK_SELECT_KEY(F32, I8):
    case _PACK_SELECT_KEY(F16, I8):
-        key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( F32, I32, num_stages );
        break;
    default:
        break;
@ -450,12 +440,7 @@ static vsi_nn_kernel_node_t _setup
    int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
    int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
    vsi_bool is_odd_even_sort = FALSE;
-    vsi_bool is_bitnoic_segment = FALSE;
    size_t param_num = _TOPK_PARAM_NUM;
-    int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
-    vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
-    vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
-
    float inputScale  = vsi_nn_get_tensor_scale(inputs[0]);
    float inputTail   = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
@ -486,14 +471,9 @@ static vsi_nn_kernel_node_t _setup
    rs_tensors[0] = vsi_nn_reshape_tensor( graph,
        inputs[0], shape[0], 2 );

-    is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) &&
-        type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32);
-    num_stages = is_bitnoic_segment ? 9 : num_stages;
-    max_stages = is_bitnoic_segment ? max_stages : 7;
-
-    if (num_stages < max_stages || is_bitnoic_segment)
+    if (num_stages < 7)
    {
-        status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment );
+        status = _query_kernel( kernel, inputs, outputs, num_stages );

        rs_tensors[1] = vsi_nn_reshape_tensor( graph,
            outputs[0], shape[1], 2 );
--- a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl
@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            float left_elem = local_data[left_id]; \
            float right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            uint left_elem = local_data[left_id]; \
            uint right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
@ -227,7 +227,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            int left_elem = local_data[left_id]; \
            int right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
@ -315,7 +315,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            float left_elem = local_data[left_id]; \
            float right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
@ -403,7 +403,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            float left_elem = local_data[left_id]; \
            float right_elem = local_data[right_id]; \
 \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
            { \
                local_data[left_id] = right_elem; \
                local_data[right_id] = left_elem; \
--- a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl
@ -1,368 +0,0 @@
-
-#define BITONIC_STEP(dtype) \
-void bitonic_step_##dtype(uint num_stages, int lx, \
-        __local dtype *local_data, __local int *local_indices) \
-{ \
-    for (uint stage = 0; stage < num_stages + 1; ++stage) \
-    { \
-        uint signo = (lx >> stage) & 1; \
- \
-        for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
-        { \
-            uint postShift = (stage - passOfStage); \
-            uint pairDistance = 1 << postShift; \
- \
-            uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
-            uint right_id = left_id + pairDistance; \
- \
-            int left_idx = local_indices[left_id]; \
-            int right_idx = local_indices[right_id]; \
- \
-            dtype left_elem = local_data[left_id]; \
-            dtype right_elem = local_data[right_id]; \
- \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
-            { \
-                local_data[left_id] = right_elem; \
-                local_data[right_id] = left_elem; \
- \
-                local_indices[left_id] = right_idx; \
-                local_indices[right_id] = left_idx; \
-            } \
- \
-            barrier(CLK_LOCAL_MEM_FENCE); \
-        } \
-    } \
-}
-BITONIC_STEP(int)
-BITONIC_STEP(uint)
-
-#define BITONIC_STEP_ASCEND(dtype) \
-void bitonic_step_ascend_##dtype(uint num_stages, int lx, \
-        __local dtype *p_share_k, __local int *p_share_v) \
-{ \
-    for (uint stage = 0; stage < num_stages + 1; ++stage) \
-    { \
-        uint signo = (lx >> stage) & 1; \
- \
-        for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
-        { \
-            uint postShift = (stage - passOfStage); \
-            uint pairDistance = 1 << postShift; \
- \
-            uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
-            uint right_id = left_id + pairDistance; \
- \
-            int left_idx = p_share_v[left_id]; \
-            int right_idx = p_share_v[right_id]; \
- \
-            dtype left_elem = p_share_k[left_id]; \
-            dtype right_elem = p_share_k[right_id]; \
- \
-            if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \
-            { \
-                p_share_k[left_id] = right_elem; \
-                p_share_k[right_id] = left_elem; \
- \
-                p_share_v[left_id] = right_idx; \
-                p_share_v[right_id] = left_idx; \
-            } \
- \
-            barrier(CLK_LOCAL_MEM_FENCE); \
-        } \
-    } \
-}
-BITONIC_STEP_ASCEND(int)
-BITONIC_STEP_ASCEND(uint)
-
-#define BITONIC_MERGE(dtype) \
-void bitonic_merge_##dtype(uint num_stages, int lx, \
-        __local dtype *local_data, __local int *local_indices) \
-{ \
-    uint stage = num_stages; \
-    uint signo = (lx >> stage) & 1; \
- \
-    for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
-    { \
-        uint postShift = (stage - passOfStage); \
-        uint pairDistance = 1 << postShift; \
- \
-        uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
-        uint right_id = left_id + pairDistance; \
- \
-        int left_idx = local_indices[left_id]; \
-        int right_idx = local_indices[right_id]; \
- \
-        dtype left_elem = local_data[left_id]; \
-        dtype right_elem = local_data[right_id]; \
- \
-        if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
-        { \
-            local_data[left_id] = right_elem; \
-            local_data[right_id] = left_elem; \
- \
-            local_indices[left_id] = right_idx; \
-            local_indices[right_id] = left_idx; \
-        } \
- \
-        barrier(CLK_LOCAL_MEM_FENCE); \
-    } \
-}
-BITONIC_MERGE(int)
-BITONIC_MERGE(uint)
-
-#define BLOCK_SIZE              (512)
-
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32
-(
-  __read_only  image2d_t input,
-  __write_only image2d_t output,
-  __write_only image2d_t indices,
-               float     input_scale,
-               float     input_tail,
-               float     output_scale,
-               float     output_tail,
-               int       _num_stages,
-               int       width
-  )
- {
-    uint lx = get_local_id(0);
-    const int init_k = -2147483647;
-    const int init_v = -2147483647;
-    const int num_stages = 9;
-    const int threads_per_block = BLOCK_SIZE;
-    const int index_minus_1 = threads_per_block * 2 - 1;
-    uint offset = 0;
-    uint lx1 = lx + threads_per_block;
-
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    __local int local_data[1536];
-    __local int local_indices[1536];
-
-    int left = read_imagei(input, coord.xy).x;
-    coord.z += threads_per_block;
-    int right = read_imagei(input, coord.zy).x;
-
-    local_data[lx] = left;
-    local_indices[lx] = coord.x;
-    local_data[lx1] = right;
-    local_indices[lx1] = coord.z;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    bitonic_step_int(num_stages, lx, local_data, local_indices);
-
-    int min_data = local_data[511];
-
-    int *p_share_k = local_data + threads_per_block;
-    int *p_share_v = local_indices + threads_per_block;
-
-    int limit = (width >> 10) << 10;
-    p_share_k[lx] = init_k;
-    p_share_v[lx] = init_v;
-
-    p_share_k[lx1] = init_k;
-    p_share_v[lx1] = init_v;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
-    {
-        int2 data;
-        coord.z = coord.x + threads_per_block;
-        data.x = read_imagei(input, coord.xy).x;
-        data.y = read_imagei(input, coord.zy).x;
-
-        p_share_k[lx] = data.x;
-        p_share_v[lx] = coord.x;
-
-        p_share_k[lx1] = data.y;
-        p_share_v[lx1] = coord.z;
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
-
-        if (p_share_k[index_minus_1] < min_data)
-        {
-            continue;
-        }
-
-        p_share_k[lx] = p_share_k[lx1];
-        p_share_v[lx] = p_share_v[lx1];
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_merge_int(num_stages, lx, local_data, local_indices);
-
-        min_data = local_data[511];
-        p_share_k[lx] = init_k;
-        p_share_v[lx] = init_v;
-        p_share_k[lx1] = init_k;
-        p_share_v[lx1] = init_v;
-    }
-
-    if (width > limit)
-    {
-        if (coord.x < width)
-        {
-            int2 data;
-            data.x = read_imagei(input, coord.xy).x;
-            coord.z = coord.x + threads_per_block;
-            data.y = read_imagei(input, coord.zy).x;
-
-            p_share_k[lx] = data.x;
-            p_share_v[lx] = coord.x;
-
-            p_share_k[lx1] = coord.z < width ? data.y : init_k;
-            p_share_v[lx1] = coord.z < width ? coord.z : init_v;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
-
-        if (p_share_k[index_minus_1] >= min_data)
-        {
-            p_share_k[lx] = p_share_k[lx1];
-            p_share_v[lx] = p_share_v[lx1];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            bitonic_merge_int(num_stages, lx, local_data, local_indices);
-        }
-    }
-
-    int4 dst;
-    dst.x = local_data[lx];
-
-    coord.x = lx;
-    write_imagei(output, coord.xy, dst.xxxx);
-
-    int4 index;
-    index.x = local_indices[lx];
-
-    write_imagei(indices, coord.xy, index.xxxx);
-}
-
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32
-(
-  __read_only  image2d_t input,
-  __write_only image2d_t output,
-  __write_only image2d_t indices,
-               float     input_scale,
-               float     input_tail,
-               float     output_scale,
-               float     output_tail,
-               int       _num_stages,
-               int       width
-  )
- {
-    uint lx = get_local_id(0);
-    const uint init_k = 0;
-    const int init_v = -2147483647;
-    const int num_stages = 9;
-    const int threads_per_block = BLOCK_SIZE;
-    const int index_minus_1 = threads_per_block * 2 - 1;
-    uint offset = 0;
-    uint lx1 = lx + threads_per_block;
-
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    __local uint local_data[1536];
-    __local int local_indices[1536];
-
-    uint left = read_imageui(input, coord.xy).x;
-    coord.z += threads_per_block;
-    uint right = read_imageui(input, coord.zy).x;
-
-    local_data[lx] = left;
-    local_indices[lx] = coord.x;
-    local_data[lx1] = right;
-    local_indices[lx1] = coord.z;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    bitonic_step_uint(num_stages, lx, local_data, local_indices);
-
-    uint min_data = local_data[511];
-
-    uint *p_share_k = local_data + threads_per_block;
-    int *p_share_v = local_indices + threads_per_block;
-
-    int limit = (width >> 10) << 10;
-    p_share_k[lx] = init_k;
-    p_share_v[lx] = init_v;
-
-    p_share_k[lx1] = init_k;
-    p_share_v[lx1] = init_v;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
-    {
-        uint2 data;
-        coord.z = coord.x + threads_per_block;
-        data.x = read_imageui(input, coord.xy).x;
-        data.y = read_imageui(input, coord.zy).x;
-
-        p_share_k[lx] = data.x;
-        p_share_v[lx] = coord.x;
-
-        p_share_k[lx1] = data.y;
-        p_share_v[lx1] = coord.z;
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
-
-        if (p_share_k[index_minus_1] < min_data)
-        {
-            continue;
-        }
-
-        p_share_k[lx] = p_share_k[lx1];
-        p_share_v[lx] = p_share_v[lx1];
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_merge_uint(num_stages, lx, local_data, local_indices);
-
-        min_data = local_data[511];
-        p_share_k[lx] = init_k;
-        p_share_v[lx] = init_v;
-        p_share_k[lx1] = init_k;
-        p_share_v[lx1] = init_v;
-    }
-
-    if (width > limit)
-    {
-        if (coord.x < width)
-        {
-            uint2 data;
-            data.x = read_imageui(input, coord.xy).x;
-            coord.z = coord.x + threads_per_block;
-            data.y = read_imageui(input, coord.zy).x;
-
-            p_share_k[lx] = data.x;
-            p_share_v[lx] = coord.x;
-
-            p_share_k[lx1] = coord.z < width ? data.y : init_k;
-            p_share_v[lx1] = coord.z < width ? coord.z : init_v;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
-
-        if (p_share_k[index_minus_1] >= min_data)
-        {
-            p_share_k[lx] = p_share_k[lx1];
-            p_share_v[lx] = p_share_v[lx1];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            bitonic_merge_uint(num_stages, lx, local_data, local_indices);
-        }
-    }
-
-    uint4 dst;
-    dst.x = local_data[lx];
-
-    coord.x = lx;
-    write_imageui(output, coord.xy, dst.xxxx);
-
-    int4 index;
-    index.x = local_indices[lx];
-
-    write_imagei(indices, coord.xy, index.xxxx);
-}
--- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
+++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
@ -80509,7 +80509,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            float left_elem = local_data[left_id]; \\\n\
            float right_elem = local_data[right_id]; \\\n\
 \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
            { \\\n\
                local_data[left_id] = right_elem; \\\n\
                local_data[right_id] = left_elem; \\\n\
@ -80597,7 +80597,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            uint left_elem = local_data[left_id]; \\\n\
            uint right_elem = local_data[right_id]; \\\n\
 \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
            { \\\n\
                local_data[left_id] = right_elem; \\\n\
                local_data[right_id] = left_elem; \\\n\
@ -80685,7 +80685,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            int left_elem = local_data[left_id]; \\\n\
            int right_elem = local_data[right_id]; \\\n\
 \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
            { \\\n\
                local_data[left_id] = right_elem; \\\n\
                local_data[right_id] = left_elem; \\\n\
@ -80773,7 +80773,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            float left_elem = local_data[left_id]; \\\n\
            float right_elem = local_data[right_id]; \\\n\
 \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
            { \\\n\
                local_data[left_id] = right_elem; \\\n\
                local_data[right_id] = left_elem; \\\n\
@ -80861,7 +80861,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
            float left_elem = local_data[left_id]; \\\n\
            float right_elem = local_data[right_id]; \\\n\
 \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
            { \\\n\
                local_data[left_id] = right_elem; \\\n\
                local_data[right_id] = left_elem; \\\n\
@ -80896,376 +80896,6 @@ TOPK_F32toI32(1 << 4, 4)\n\
 TOPK_F32toI32(1 << 5, 5)\n\
 TOPK_F32toI32(1 << 6, 6)"; /* end of topk_cl*/

-static const char topk2_cl[] = "\n\
-#define BITONIC_STEP(dtype) \\\n\
-void bitonic_step_##dtype(uint num_stages, int lx, \\\n\
-        __local dtype *local_data, __local int *local_indices) \\\n\
-{ \\\n\
-    for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\
-    { \\\n\
-        uint signo = (lx >> stage) & 1; \\\n\
- \\\n\
-        for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\
-        { \\\n\
-            uint postShift = (stage - passOfStage); \\\n\
-            uint pairDistance = 1 << postShift; \\\n\
- \\\n\
-            uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\
-            uint right_id = left_id + pairDistance; \\\n\
- \\\n\
-            int left_idx = local_indices[left_id]; \\\n\
-            int right_idx = local_indices[right_id]; \\\n\
- \\\n\
-            dtype left_elem = local_data[left_id]; \\\n\
-            dtype right_elem = local_data[right_id]; \\\n\
- \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
-            { \\\n\
-                local_data[left_id] = right_elem; \\\n\
-                local_data[right_id] = left_elem; \\\n\
- \\\n\
-                local_indices[left_id] = right_idx; \\\n\
-                local_indices[right_id] = left_idx; \\\n\
-            } \\\n\
- \\\n\
-            barrier(CLK_LOCAL_MEM_FENCE); \\\n\
-        } \\\n\
-    } \\\n\
-}\n\
-BITONIC_STEP(int)\n\
-BITONIC_STEP(uint)\n\
-\n\
-#define BITONIC_STEP_ASCEND(dtype) \\\n\
-void bitonic_step_ascend_##dtype(uint num_stages, int lx, \\\n\
-        __local dtype *p_share_k, __local int *p_share_v) \\\n\
-{ \\\n\
-    for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\
-    { \\\n\
-        uint signo = (lx >> stage) & 1; \\\n\
- \\\n\
-        for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\
-        { \\\n\
-            uint postShift = (stage - passOfStage); \\\n\
-            uint pairDistance = 1 << postShift; \\\n\
- \\\n\
-            uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\
-            uint right_id = left_id + pairDistance; \\\n\
- \\\n\
-            int left_idx = p_share_v[left_id]; \\\n\
-            int right_idx = p_share_v[right_id]; \\\n\
- \\\n\
-            dtype left_elem = p_share_k[left_id]; \\\n\
-            dtype right_elem = p_share_k[right_id]; \\\n\
- \\\n\
-            if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \\\n\
-            { \\\n\
-                p_share_k[left_id] = right_elem; \\\n\
-                p_share_k[right_id] = left_elem; \\\n\
- \\\n\
-                p_share_v[left_id] = right_idx; \\\n\
-                p_share_v[right_id] = left_idx; \\\n\
-            } \\\n\
- \\\n\
-            barrier(CLK_LOCAL_MEM_FENCE); \\\n\
-        } \\\n\
-    } \\\n\
-}\n\
-BITONIC_STEP_ASCEND(int)\n\
-BITONIC_STEP_ASCEND(uint)\n\
-\n\
-#define BITONIC_MERGE(dtype) \\\n\
-void bitonic_merge_##dtype(uint num_stages, int lx, \\\n\
-        __local dtype *local_data, __local int *local_indices) \\\n\
-{ \\\n\
-    uint stage = num_stages; \\\n\
-    uint signo = (lx >> stage) & 1; \\\n\
- \\\n\
-    for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\
-    { \\\n\
-        uint postShift = (stage - passOfStage); \\\n\
-        uint pairDistance = 1 << postShift; \\\n\
- \\\n\
-        uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\
-        uint right_id = left_id + pairDistance; \\\n\
- \\\n\
-        int left_idx = local_indices[left_id]; \\\n\
-        int right_idx = local_indices[right_id]; \\\n\
- \\\n\
-        dtype left_elem = local_data[left_id]; \\\n\
-        dtype right_elem = local_data[right_id]; \\\n\
- \\\n\
-        if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
-        { \\\n\
-            local_data[left_id] = right_elem; \\\n\
-            local_data[right_id] = left_elem; \\\n\
- \\\n\
-            local_indices[left_id] = right_idx; \\\n\
-            local_indices[right_id] = left_idx; \\\n\
-        } \\\n\
- \\\n\
-        barrier(CLK_LOCAL_MEM_FENCE); \\\n\
-    } \\\n\
-}\n\
-BITONIC_MERGE(int)\n\
-BITONIC_MERGE(uint)\n\
-\n\
-#define BLOCK_SIZE              (512)\n\
-\n\
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32\n\
-(\n\
-  __read_only  image2d_t input,\n\
-  __write_only image2d_t output,\n\
-  __write_only image2d_t indices,\n\
-               float     input_scale,\n\
-               float     input_tail,\n\
-               float     output_scale,\n\
-               float     output_tail,\n\
-               int       _num_stages,\n\
-               int       width\n\
-  )\n\
- {\n\
-    uint lx = get_local_id(0);\n\
-    const int init_k = -2147483647;\n\
-    const int init_v = -2147483647;\n\
-    const int num_stages = 9;\n\
-    const int threads_per_block = BLOCK_SIZE;\n\
-    const int index_minus_1 = threads_per_block * 2 - 1;\n\
-    uint offset = 0;\n\
-    uint lx1 = lx + threads_per_block;\n\
-\n\
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\
-\n\
-    __local int local_data[1536];\n\
-    __local int local_indices[1536];\n\
-\n\
-    int left = read_imagei(input, coord.xy).x;\n\
-    coord.z += threads_per_block;\n\
-    int right = read_imagei(input, coord.zy).x;\n\
-\n\
-    local_data[lx] = left;\n\
-    local_indices[lx] = coord.x;\n\
-    local_data[lx1] = right;\n\
-    local_indices[lx1] = coord.z;\n\
-\n\
-    barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-    bitonic_step_int(num_stages, lx, local_data, local_indices);\n\
-\n\
-    int min_data = local_data[511];\n\
-\n\
-    int *p_share_k = local_data + threads_per_block;\n\
-    int *p_share_v = local_indices + threads_per_block;\n\
-\n\
-    int limit = (width >> 10) << 10;\n\
-    p_share_k[lx] = init_k;\n\
-    p_share_v[lx] = init_v;\n\
-\n\
-    p_share_k[lx1] = init_k;\n\
-    p_share_v[lx1] = init_v;\n\
-    barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\
-    {\n\
-        int2 data;\n\
-        coord.z = coord.x + threads_per_block;\n\
-        data.x = read_imagei(input, coord.xy).x;\n\
-        data.y = read_imagei(input, coord.zy).x;\n\
-\n\
-        p_share_k[lx] = data.x;\n\
-        p_share_v[lx] = coord.x;\n\
-\n\
-        p_share_k[lx1] = data.y;\n\
-        p_share_v[lx1] = coord.z;\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\
-\n\
-        if (p_share_k[index_minus_1] < min_data)\n\
-        {\n\
-            continue;\n\
-        }\n\
-\n\
-        p_share_k[lx] = p_share_k[lx1];\n\
-        p_share_v[lx] = p_share_v[lx1];\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\
-\n\
-        min_data = local_data[511];\n\
-        p_share_k[lx] = init_k;\n\
-        p_share_v[lx] = init_v;\n\
-        p_share_k[lx1] = init_k;\n\
-        p_share_v[lx1] = init_v;\n\
-    }\n\
-\n\
-    if (width > limit)\n\
-    {\n\
-        if (coord.x < width)\n\
-        {\n\
-            int2 data;\n\
-            data.x = read_imagei(input, coord.xy).x;\n\
-            coord.z = coord.x + threads_per_block;\n\
-            data.y = read_imagei(input, coord.zy).x;\n\
-\n\
-            p_share_k[lx] = data.x;\n\
-            p_share_v[lx] = coord.x;\n\
-\n\
-            p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\
-            p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\
-        }\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\
-\n\
-        if (p_share_k[index_minus_1] >= min_data)\n\
-        {\n\
-            p_share_k[lx] = p_share_k[lx1];\n\
-            p_share_v[lx] = p_share_v[lx1];\n\
-            barrier(CLK_LOCAL_MEM_FENCE);\n\
-            bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\
-        }\n\
-    }\n\
-\n\
-    int4 dst;\n\
-    dst.x = local_data[lx];\n\
-\n\
-    coord.x = lx;\n\
-    write_imagei(output, coord.xy, dst.xxxx);\n\
-\n\
-    int4 index;\n\
-    index.x = local_indices[lx];\n\
-\n\
-    write_imagei(indices, coord.xy, index.xxxx);\n\
-}\n\
-\n\
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32\n\
-(\n\
-  __read_only  image2d_t input,\n\
-  __write_only image2d_t output,\n\
-  __write_only image2d_t indices,\n\
-               float     input_scale,\n\
-               float     input_tail,\n\
-               float     output_scale,\n\
-               float     output_tail,\n\
-               int       _num_stages,\n\
-               int       width\n\
-  )\n\
- {\n\
-    uint lx = get_local_id(0);\n\
-    const uint init_k = 0;\n\
-    const int init_v = -2147483647;\n\
-    const int num_stages = 9;\n\
-    const int threads_per_block = BLOCK_SIZE;\n\
-    const int index_minus_1 = threads_per_block * 2 - 1;\n\
-    uint offset = 0;\n\
-    uint lx1 = lx + threads_per_block;\n\
-\n\
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\
-\n\
-    __local uint local_data[1536];\n\
-    __local int local_indices[1536];\n\
-\n\
-    uint left = read_imageui(input, coord.xy).x;\n\
-    coord.z += threads_per_block;\n\
-    uint right = read_imageui(input, coord.zy).x;\n\
-\n\
-    local_data[lx] = left;\n\
-    local_indices[lx] = coord.x;\n\
-    local_data[lx1] = right;\n\
-    local_indices[lx1] = coord.z;\n\
-\n\
-    barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-    bitonic_step_uint(num_stages, lx, local_data, local_indices);\n\
-\n\
-    uint min_data = local_data[511];\n\
-\n\
-    uint *p_share_k = local_data + threads_per_block;\n\
-    int *p_share_v = local_indices + threads_per_block;\n\
-\n\
-    int limit = (width >> 10) << 10;\n\
-    p_share_k[lx] = init_k;\n\
-    p_share_v[lx] = init_v;\n\
-\n\
-    p_share_k[lx1] = init_k;\n\
-    p_share_v[lx1] = init_v;\n\
-    barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\
-    {\n\
-        uint2 data;\n\
-        coord.z = coord.x + threads_per_block;\n\
-        data.x = read_imageui(input, coord.xy).x;\n\
-        data.y = read_imageui(input, coord.zy).x;\n\
-\n\
-        p_share_k[lx] = data.x;\n\
-        p_share_v[lx] = coord.x;\n\
-\n\
-        p_share_k[lx1] = data.y;\n\
-        p_share_v[lx1] = coord.z;\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\
-\n\
-        if (p_share_k[index_minus_1] < min_data)\n\
-        {\n\
-            continue;\n\
-        }\n\
-\n\
-        p_share_k[lx] = p_share_k[lx1];\n\
-        p_share_v[lx] = p_share_v[lx1];\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\
-\n\
-        min_data = local_data[511];\n\
-        p_share_k[lx] = init_k;\n\
-        p_share_v[lx] = init_v;\n\
-        p_share_k[lx1] = init_k;\n\
-        p_share_v[lx1] = init_v;\n\
-    }\n\
-\n\
-    if (width > limit)\n\
-    {\n\
-        if (coord.x < width)\n\
-        {\n\
-            uint2 data;\n\
-            data.x = read_imageui(input, coord.xy).x;\n\
-            coord.z = coord.x + threads_per_block;\n\
-            data.y = read_imageui(input, coord.zy).x;\n\
-\n\
-            p_share_k[lx] = data.x;\n\
-            p_share_v[lx] = coord.x;\n\
-\n\
-            p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\
-            p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\
-        }\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\
-\n\
-        if (p_share_k[index_minus_1] >= min_data)\n\
-        {\n\
-            p_share_k[lx] = p_share_k[lx1];\n\
-            p_share_v[lx] = p_share_v[lx1];\n\
-            barrier(CLK_LOCAL_MEM_FENCE);\n\
-            bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\
-        }\n\
-    }\n\
-\n\
-    uint4 dst;\n\
-    dst.x = local_data[lx];\n\
-\n\
-    coord.x = lx;\n\
-    write_imageui(output, coord.xy, dst.xxxx);\n\
-\n\
-    int4 index;\n\
-    index.x = local_indices[lx];\n\
-\n\
-    write_imagei(indices, coord.xy, index.xxxx);\n\
-}\n\
-"; /* end of topk2_cl*/
-
 static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X    (32)\n\
 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\
 (\n\
@ -82484,7 +82114,6 @@ static const source_map_t cl_resource[] =
    {"swish_cl", swish_cl},
    {"tile_cl", tile_cl},
    {"topk_cl", topk_cl},
-    {"topk2_cl", topk2_cl},
    {"topk_odd_even_sort_cl", topk_odd_even_sort_cl},
    {"topk_odd_even_sort2_cl", topk_odd_even_sort2_cl},
    {"upsample_cl", upsample_cl},