diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h
index 477cb19..4ac9f61 100644
--- a/src/tim/vx/internal/include/vsi_nn_context.h
+++ b/src/tim/vx/internal/include/vsi_nn_context.h
@@ -60,7 +60,9 @@ typedef struct _vsi_nn_hw_config_t
 {
     char target_name[VSI_NN_MAX_TARGET_NAME];
     vsi_nn_hw_evis_t evis;
+#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
     uint32_t subGroupSize;
+#endif
     uint32_t use_40bits_va;
     uint32_t support_stream_processor;
     uint32_t sp_exec_count;
diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h
index 0fafc24..92f8349 100644
--- a/src/tim/vx/internal/include/vsi_nn_version.h
+++ b/src/tim/vx/internal/include/vsi_nn_version.h
@@ -33,7 +33,7 @@ extern "C"{
 
 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 2
-#define VSI_NN_VERSION_PATCH 6
+#define VSI_NN_VERSION_PATCH 5
 #define VSI_NN_VERSION \
     (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
 
diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c
index a21d290..b8cdfd0 100644
--- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c
@@ -34,24 +34,20 @@
 #include "vsi_nn_tensor_util.h"
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
+#include "libnnext/vx_lib_nnext.h"
 
 __BEGIN_DECLS
 
 #define _TOPK_KERNEL_SOURCE      "topk"
 #define STR(a) #a
 // Add kernel hashtable here
-#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \
-        ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26))
+#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
+        ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
 #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
-        { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \
+        { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
           CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
           _TOPK_KERNEL_SOURCE }
 
-#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
-        { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \
-          CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
-          "topk2" }
-
 #define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
         ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
 #define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
@@ -115,9 +111,6 @@ static const _kernel_map_type _topk_kernel_map[] =
     PACK_KERNEL_MAP( F32, I32, 4 ),
     PACK_KERNEL_MAP( F32, I32, 5 ),
     PACK_KERNEL_MAP( F32, I32, 6 ),
-
-    PACK_MERGE_KERNEL_MAP(U32, U32),
-    PACK_MERGE_KERNEL_MAP(I32, I32),
 };
 
 static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
@@ -261,8 +254,7 @@ static vsi_status _query_kernel
     vsi_nn_kernel_t * kernel,
     vsi_nn_tensor_t * const * const inputs,
     vsi_nn_tensor_t * const * const outputs,
-    int32_t num_stages,
-    vsi_bool is_bitnoic_segment
+    int32_t num_stages
     )
 {
     vsi_status status = VSI_FAILURE;
@@ -280,23 +272,21 @@ static vsi_status _query_kernel
     in_dtype  = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
     out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
 
-    num_stages = is_bitnoic_segment ? 0 : num_stages;
-
     switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
     {
     case _PACK_SELECT_KEY(F32, F32):
     case _PACK_SELECT_KEY(F16, F16):
-        key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( F32, F32, num_stages );
         break;
     case _PACK_SELECT_KEY(U32, U32):
     case _PACK_SELECT_KEY(U16, U16):
     case _PACK_SELECT_KEY(U8,  U8):
-        key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( U32, U32, num_stages );
         break;
     case _PACK_SELECT_KEY(I32, I32):
     case _PACK_SELECT_KEY(I16, I16):
     case _PACK_SELECT_KEY(I8,  I8):
-        key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( I32, I32, num_stages );
         break;
     case _PACK_SELECT_KEY(F32, U32):
     case _PACK_SELECT_KEY(F16, U32):
@@ -304,7 +294,7 @@ static vsi_status _query_kernel
     case _PACK_SELECT_KEY(F16, U16):
     case _PACK_SELECT_KEY(F32, U8):
     case _PACK_SELECT_KEY(F16, U8):
-        key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( F32, U32, num_stages );
         break;
     case _PACK_SELECT_KEY(F32, I32):
     case _PACK_SELECT_KEY(F16, I32):
@@ -312,7 +302,7 @@ static vsi_status _query_kernel
     case _PACK_SELECT_KEY(F16, I16):
     case _PACK_SELECT_KEY(F32, I8):
     case _PACK_SELECT_KEY(F16, I8):
-        key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment );
+        key = TOPK_HASH_KEY( F32, I32, num_stages );
         break;
     default:
         break;
@@ -450,12 +440,7 @@ static vsi_nn_kernel_node_t _setup
     int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
     int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
     vsi_bool is_odd_even_sort = FALSE;
-    vsi_bool is_bitnoic_segment = FALSE;
     size_t param_num = _TOPK_PARAM_NUM;
-    int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
-    vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
-    vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
-
     float inputScale  = vsi_nn_get_tensor_scale(inputs[0]);
     float inputTail   = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
     float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
@@ -486,14 +471,9 @@ static vsi_nn_kernel_node_t _setup
     rs_tensors[0] = vsi_nn_reshape_tensor( graph,
         inputs[0], shape[0], 2 );
 
-    is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) &&
-        type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32);
-    num_stages = is_bitnoic_segment ? 9 : num_stages;
-    max_stages = is_bitnoic_segment ? max_stages : 7;
-
-    if (num_stages < max_stages || is_bitnoic_segment)
+    if (num_stages < 7)
     {
-        status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment );
+        status = _query_kernel( kernel, inputs, outputs, num_stages );
 
         rs_tensors[1] = vsi_nn_reshape_tensor( graph,
             outputs[0], shape[1], 2 );
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl
index dc20389..0e6166c 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl
@@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             float left_elem = local_data[left_id]; \
             float right_elem = local_data[right_id]; \
  \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
             { \
                 local_data[left_id] = right_elem; \
                 local_data[right_id] = left_elem; \
@@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             uint left_elem = local_data[left_id]; \
             uint right_elem = local_data[right_id]; \
  \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
             { \
                 local_data[left_id] = right_elem; \
                 local_data[right_id] = left_elem; \
@@ -227,7 +227,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             int left_elem = local_data[left_id]; \
             int right_elem = local_data[right_id]; \
  \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
             { \
                 local_data[left_id] = right_elem; \
                 local_data[right_id] = left_elem; \
@@ -315,7 +315,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             float left_elem = local_data[left_id]; \
             float right_elem = local_data[right_id]; \
  \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
             { \
                 local_data[left_id] = right_elem; \
                 local_data[right_id] = left_elem; \
@@ -403,7 +403,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             float left_elem = local_data[left_id]; \
             float right_elem = local_data[right_id]; \
  \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
+            if ((left_elem < right_elem) ^ signo) \
             { \
                 local_data[left_id] = right_elem; \
                 local_data[right_id] = left_elem; \
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl
deleted file mode 100644
index 0eae5ab..0000000
--- a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl
+++ /dev/null
@@ -1,368 +0,0 @@
-
-#define BITONIC_STEP(dtype) \
-void bitonic_step_##dtype(uint num_stages, int lx, \
-        __local dtype *local_data, __local int *local_indices) \
-{ \
-    for (uint stage = 0; stage < num_stages + 1; ++stage) \
-    { \
-        uint signo = (lx >> stage) & 1; \
- \
-        for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
-        { \
-            uint postShift = (stage - passOfStage); \
-            uint pairDistance = 1 << postShift; \
- \
-            uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
-            uint right_id = left_id + pairDistance; \
- \
-            int left_idx = local_indices[left_id]; \
-            int right_idx = local_indices[right_id]; \
- \
-            dtype left_elem = local_data[left_id]; \
-            dtype right_elem = local_data[right_id]; \
- \
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
-            { \
-                local_data[left_id] = right_elem; \
-                local_data[right_id] = left_elem; \
- \
-                local_indices[left_id] = right_idx; \
-                local_indices[right_id] = left_idx; \
-            } \
- \
-            barrier(CLK_LOCAL_MEM_FENCE); \
-        } \
-    } \
-}
-BITONIC_STEP(int)
-BITONIC_STEP(uint)
-
-#define BITONIC_STEP_ASCEND(dtype) \
-void bitonic_step_ascend_##dtype(uint num_stages, int lx, \
-        __local dtype *p_share_k, __local int *p_share_v) \
-{ \
-    for (uint stage = 0; stage < num_stages + 1; ++stage) \
-    { \
-        uint signo = (lx >> stage) & 1; \
- \
-        for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
-        { \
-            uint postShift = (stage - passOfStage); \
-            uint pairDistance = 1 << postShift; \
- \
-            uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
-            uint right_id = left_id + pairDistance; \
- \
-            int left_idx = p_share_v[left_id]; \
-            int right_idx = p_share_v[right_id]; \
- \
-            dtype left_elem = p_share_k[left_id]; \
-            dtype right_elem = p_share_k[right_id]; \
- \
-            if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \
-            { \
-                p_share_k[left_id] = right_elem; \
-                p_share_k[right_id] = left_elem; \
- \
-                p_share_v[left_id] = right_idx; \
-                p_share_v[right_id] = left_idx; \
-            } \
- \
-            barrier(CLK_LOCAL_MEM_FENCE); \
-        } \
-    } \
-}
-BITONIC_STEP_ASCEND(int)
-BITONIC_STEP_ASCEND(uint)
-
-#define BITONIC_MERGE(dtype) \
-void bitonic_merge_##dtype(uint num_stages, int lx, \
-        __local dtype *local_data, __local int *local_indices) \
-{ \
-    uint stage = num_stages; \
-    uint signo = (lx >> stage) & 1; \
- \
-    for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
-    { \
-        uint postShift = (stage - passOfStage); \
-        uint pairDistance = 1 << postShift; \
- \
-        uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
-        uint right_id = left_id + pairDistance; \
- \
-        int left_idx = local_indices[left_id]; \
-        int right_idx = local_indices[right_id]; \
- \
-        dtype left_elem = local_data[left_id]; \
-        dtype right_elem = local_data[right_id]; \
- \
-        if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
-        { \
-            local_data[left_id] = right_elem; \
-            local_data[right_id] = left_elem; \
- \
-            local_indices[left_id] = right_idx; \
-            local_indices[right_id] = left_idx; \
-        } \
- \
-        barrier(CLK_LOCAL_MEM_FENCE); \
-    } \
-}
-BITONIC_MERGE(int)
-BITONIC_MERGE(uint)
-
-#define BLOCK_SIZE              (512)
-
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32
-(
-  __read_only  image2d_t input,
-  __write_only image2d_t output,
-  __write_only image2d_t indices,
-               float     input_scale,
-               float     input_tail,
-               float     output_scale,
-               float     output_tail,
-               int       _num_stages,
-               int       width
-  )
- {
-    uint lx = get_local_id(0);
-    const int init_k = -2147483647;
-    const int init_v = -2147483647;
-    const int num_stages = 9;
-    const int threads_per_block = BLOCK_SIZE;
-    const int index_minus_1 = threads_per_block * 2 - 1;
-    uint offset = 0;
-    uint lx1 = lx + threads_per_block;
-
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    __local int local_data[1536];
-    __local int local_indices[1536];
-
-    int left = read_imagei(input, coord.xy).x;
-    coord.z += threads_per_block;
-    int right = read_imagei(input, coord.zy).x;
-
-    local_data[lx] = left;
-    local_indices[lx] = coord.x;
-    local_data[lx1] = right;
-    local_indices[lx1] = coord.z;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    bitonic_step_int(num_stages, lx, local_data, local_indices);
-
-    int min_data = local_data[511];
-
-    int *p_share_k = local_data + threads_per_block;
-    int *p_share_v = local_indices + threads_per_block;
-
-    int limit = (width >> 10) << 10;
-    p_share_k[lx] = init_k;
-    p_share_v[lx] = init_v;
-
-    p_share_k[lx1] = init_k;
-    p_share_v[lx1] = init_v;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
-    {
-        int2 data;
-        coord.z = coord.x + threads_per_block;
-        data.x = read_imagei(input, coord.xy).x;
-        data.y = read_imagei(input, coord.zy).x;
-
-        p_share_k[lx] = data.x;
-        p_share_v[lx] = coord.x;
-
-        p_share_k[lx1] = data.y;
-        p_share_v[lx1] = coord.z;
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
-
-        if (p_share_k[index_minus_1] < min_data)
-        {
-            continue;
-        }
-
-        p_share_k[lx] = p_share_k[lx1];
-        p_share_v[lx] = p_share_v[lx1];
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_merge_int(num_stages, lx, local_data, local_indices);
-
-        min_data = local_data[511];
-        p_share_k[lx] = init_k;
-        p_share_v[lx] = init_v;
-        p_share_k[lx1] = init_k;
-        p_share_v[lx1] = init_v;
-    }
-
-    if (width > limit)
-    {
-        if (coord.x < width)
-        {
-            int2 data;
-            data.x = read_imagei(input, coord.xy).x;
-            coord.z = coord.x + threads_per_block;
-            data.y = read_imagei(input, coord.zy).x;
-
-            p_share_k[lx] = data.x;
-            p_share_v[lx] = coord.x;
-
-            p_share_k[lx1] = coord.z < width ? data.y : init_k;
-            p_share_v[lx1] = coord.z < width ? coord.z : init_v;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
-
-        if (p_share_k[index_minus_1] >= min_data)
-        {
-            p_share_k[lx] = p_share_k[lx1];
-            p_share_v[lx] = p_share_v[lx1];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            bitonic_merge_int(num_stages, lx, local_data, local_indices);
-        }
-    }
-
-    int4 dst;
-    dst.x = local_data[lx];
-
-    coord.x = lx;
-    write_imagei(output, coord.xy, dst.xxxx);
-
-    int4 index;
-    index.x = local_indices[lx];
-
-    write_imagei(indices, coord.xy, index.xxxx);
-}
-
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32
-(
-  __read_only  image2d_t input,
-  __write_only image2d_t output,
-  __write_only image2d_t indices,
-               float     input_scale,
-               float     input_tail,
-               float     output_scale,
-               float     output_tail,
-               int       _num_stages,
-               int       width
-  )
- {
-    uint lx = get_local_id(0);
-    const uint init_k = 0;
-    const int init_v = -2147483647;
-    const int num_stages = 9;
-    const int threads_per_block = BLOCK_SIZE;
-    const int index_minus_1 = threads_per_block * 2 - 1;
-    uint offset = 0;
-    uint lx1 = lx + threads_per_block;
-
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    __local uint local_data[1536];
-    __local int local_indices[1536];
-
-    uint left = read_imageui(input, coord.xy).x;
-    coord.z += threads_per_block;
-    uint right = read_imageui(input, coord.zy).x;
-
-    local_data[lx] = left;
-    local_indices[lx] = coord.x;
-    local_data[lx1] = right;
-    local_indices[lx1] = coord.z;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    bitonic_step_uint(num_stages, lx, local_data, local_indices);
-
-    uint min_data = local_data[511];
-
-    uint *p_share_k = local_data + threads_per_block;
-    int *p_share_v = local_indices + threads_per_block;
-
-    int limit = (width >> 10) << 10;
-    p_share_k[lx] = init_k;
-    p_share_v[lx] = init_v;
-
-    p_share_k[lx1] = init_k;
-    p_share_v[lx1] = init_v;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
-    {
-        uint2 data;
-        coord.z = coord.x + threads_per_block;
-        data.x = read_imageui(input, coord.xy).x;
-        data.y = read_imageui(input, coord.zy).x;
-
-        p_share_k[lx] = data.x;
-        p_share_v[lx] = coord.x;
-
-        p_share_k[lx1] = data.y;
-        p_share_v[lx1] = coord.z;
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
-
-        if (p_share_k[index_minus_1] < min_data)
-        {
-            continue;
-        }
-
-        p_share_k[lx] = p_share_k[lx1];
-        p_share_v[lx] = p_share_v[lx1];
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_merge_uint(num_stages, lx, local_data, local_indices);
-
-        min_data = local_data[511];
-        p_share_k[lx] = init_k;
-        p_share_v[lx] = init_v;
-        p_share_k[lx1] = init_k;
-        p_share_v[lx1] = init_v;
-    }
-
-    if (width > limit)
-    {
-        if (coord.x < width)
-        {
-            uint2 data;
-            data.x = read_imageui(input, coord.xy).x;
-            coord.z = coord.x + threads_per_block;
-            data.y = read_imageui(input, coord.zy).x;
-
-            p_share_k[lx] = data.x;
-            p_share_v[lx] = coord.x;
-
-            p_share_k[lx1] = coord.z < width ? data.y : init_k;
-            p_share_v[lx1] = coord.z < width ? coord.z : init_v;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
-
-        if (p_share_k[index_minus_1] >= min_data)
-        {
-            p_share_k[lx] = p_share_k[lx1];
-            p_share_v[lx] = p_share_v[lx1];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            bitonic_merge_uint(num_stages, lx, local_data, local_indices);
-        }
-    }
-
-    uint4 dst;
-    dst.x = local_data[lx];
-
-    coord.x = lx;
-    write_imageui(output, coord.xy, dst.xxxx);
-
-    int4 index;
-    index.x = local_indices[lx];
-
-    write_imagei(indices, coord.xy, index.xxxx);
-}
diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
index d19b519..debd687 100644
--- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
+++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
@@ -80509,7 +80509,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             float left_elem = local_data[left_id]; \\\n\
             float right_elem = local_data[right_id]; \\\n\
  \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
             { \\\n\
                 local_data[left_id] = right_elem; \\\n\
                 local_data[right_id] = left_elem; \\\n\
@@ -80597,7 +80597,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             uint left_elem = local_data[left_id]; \\\n\
             uint right_elem = local_data[right_id]; \\\n\
  \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
             { \\\n\
                 local_data[left_id] = right_elem; \\\n\
                 local_data[right_id] = left_elem; \\\n\
@@ -80685,7 +80685,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             int left_elem = local_data[left_id]; \\\n\
             int right_elem = local_data[right_id]; \\\n\
  \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
             { \\\n\
                 local_data[left_id] = right_elem; \\\n\
                 local_data[right_id] = left_elem; \\\n\
@@ -80773,7 +80773,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             float left_elem = local_data[left_id]; \\\n\
             float right_elem = local_data[right_id]; \\\n\
  \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
             { \\\n\
                 local_data[left_id] = right_elem; \\\n\
                 local_data[right_id] = left_elem; \\\n\
@@ -80861,7 +80861,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
             float left_elem = local_data[left_id]; \\\n\
             float right_elem = local_data[right_id]; \\\n\
  \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
+            if ((left_elem < right_elem) ^ signo) \\\n\
             { \\\n\
                 local_data[left_id] = right_elem; \\\n\
                 local_data[right_id] = left_elem; \\\n\
@@ -80896,376 +80896,6 @@ TOPK_F32toI32(1 << 4, 4)\n\
 TOPK_F32toI32(1 << 5, 5)\n\
 TOPK_F32toI32(1 << 6, 6)"; /* end of topk_cl*/
 
-static const char topk2_cl[] = "\n\
-#define BITONIC_STEP(dtype) \\\n\
-void bitonic_step_##dtype(uint num_stages, int lx, \\\n\
-        __local dtype *local_data, __local int *local_indices) \\\n\
-{ \\\n\
-    for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\
-    { \\\n\
-        uint signo = (lx >> stage) & 1; \\\n\
- \\\n\
-        for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\
-        { \\\n\
-            uint postShift = (stage - passOfStage); \\\n\
-            uint pairDistance = 1 << postShift; \\\n\
- \\\n\
-            uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\
-            uint right_id = left_id + pairDistance; \\\n\
- \\\n\
-            int left_idx = local_indices[left_id]; \\\n\
-            int right_idx = local_indices[right_id]; \\\n\
- \\\n\
-            dtype left_elem = local_data[left_id]; \\\n\
-            dtype right_elem = local_data[right_id]; \\\n\
- \\\n\
-            if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
-            { \\\n\
-                local_data[left_id] = right_elem; \\\n\
-                local_data[right_id] = left_elem; \\\n\
- \\\n\
-                local_indices[left_id] = right_idx; \\\n\
-                local_indices[right_id] = left_idx; \\\n\
-            } \\\n\
- \\\n\
-            barrier(CLK_LOCAL_MEM_FENCE); \\\n\
-        } \\\n\
-    } \\\n\
-}\n\
-BITONIC_STEP(int)\n\
-BITONIC_STEP(uint)\n\
-\n\
-#define BITONIC_STEP_ASCEND(dtype) \\\n\
-void bitonic_step_ascend_##dtype(uint num_stages, int lx, \\\n\
-        __local dtype *p_share_k, __local int *p_share_v) \\\n\
-{ \\\n\
-    for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\
-    { \\\n\
-        uint signo = (lx >> stage) & 1; \\\n\
- \\\n\
-        for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\
-        { \\\n\
-            uint postShift = (stage - passOfStage); \\\n\
-            uint pairDistance = 1 << postShift; \\\n\
- \\\n\
-            uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\
-            uint right_id = left_id + pairDistance; \\\n\
- \\\n\
-            int left_idx = p_share_v[left_id]; \\\n\
-            int right_idx = p_share_v[right_id]; \\\n\
- \\\n\
-            dtype left_elem = p_share_k[left_id]; \\\n\
-            dtype right_elem = p_share_k[right_id]; \\\n\
- \\\n\
-            if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \\\n\
-            { \\\n\
-                p_share_k[left_id] = right_elem; \\\n\
-                p_share_k[right_id] = left_elem; \\\n\
- \\\n\
-                p_share_v[left_id] = right_idx; \\\n\
-                p_share_v[right_id] = left_idx; \\\n\
-            } \\\n\
- \\\n\
-            barrier(CLK_LOCAL_MEM_FENCE); \\\n\
-        } \\\n\
-    } \\\n\
-}\n\
-BITONIC_STEP_ASCEND(int)\n\
-BITONIC_STEP_ASCEND(uint)\n\
-\n\
-#define BITONIC_MERGE(dtype) \\\n\
-void bitonic_merge_##dtype(uint num_stages, int lx, \\\n\
-        __local dtype *local_data, __local int *local_indices) \\\n\
-{ \\\n\
-    uint stage = num_stages; \\\n\
-    uint signo = (lx >> stage) & 1; \\\n\
- \\\n\
-    for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\
-    { \\\n\
-        uint postShift = (stage - passOfStage); \\\n\
-        uint pairDistance = 1 << postShift; \\\n\
- \\\n\
-        uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\
-        uint right_id = left_id + pairDistance; \\\n\
- \\\n\
-        int left_idx = local_indices[left_id]; \\\n\
-        int right_idx = local_indices[right_id]; \\\n\
- \\\n\
-        dtype left_elem = local_data[left_id]; \\\n\
-        dtype right_elem = local_data[right_id]; \\\n\
- \\\n\
-        if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
-        { \\\n\
-            local_data[left_id] = right_elem; \\\n\
-            local_data[right_id] = left_elem; \\\n\
- \\\n\
-            local_indices[left_id] = right_idx; \\\n\
-            local_indices[right_id] = left_idx; \\\n\
-        } \\\n\
- \\\n\
-        barrier(CLK_LOCAL_MEM_FENCE); \\\n\
-    } \\\n\
-}\n\
-BITONIC_MERGE(int)\n\
-BITONIC_MERGE(uint)\n\
-\n\
-#define BLOCK_SIZE              (512)\n\
-\n\
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32\n\
-(\n\
-  __read_only  image2d_t input,\n\
-  __write_only image2d_t output,\n\
-  __write_only image2d_t indices,\n\
-               float     input_scale,\n\
-               float     input_tail,\n\
-               float     output_scale,\n\
-               float     output_tail,\n\
-               int       _num_stages,\n\
-               int       width\n\
-  )\n\
- {\n\
-    uint lx = get_local_id(0);\n\
-    const int init_k = -2147483647;\n\
-    const int init_v = -2147483647;\n\
-    const int num_stages = 9;\n\
-    const int threads_per_block = BLOCK_SIZE;\n\
-    const int index_minus_1 = threads_per_block * 2 - 1;\n\
-    uint offset = 0;\n\
-    uint lx1 = lx + threads_per_block;\n\
-\n\
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\
-\n\
-    __local int local_data[1536];\n\
-    __local int local_indices[1536];\n\
-\n\
-    int left = read_imagei(input, coord.xy).x;\n\
-    coord.z += threads_per_block;\n\
-    int right = read_imagei(input, coord.zy).x;\n\
-\n\
-    local_data[lx] = left;\n\
-    local_indices[lx] = coord.x;\n\
-    local_data[lx1] = right;\n\
-    local_indices[lx1] = coord.z;\n\
-\n\
-    barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-    bitonic_step_int(num_stages, lx, local_data, local_indices);\n\
-\n\
-    int min_data = local_data[511];\n\
-\n\
-    int *p_share_k = local_data + threads_per_block;\n\
-    int *p_share_v = local_indices + threads_per_block;\n\
-\n\
-    int limit = (width >> 10) << 10;\n\
-    p_share_k[lx] = init_k;\n\
-    p_share_v[lx] = init_v;\n\
-\n\
-    p_share_k[lx1] = init_k;\n\
-    p_share_v[lx1] = init_v;\n\
-    barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\
-    {\n\
-        int2 data;\n\
-        coord.z = coord.x + threads_per_block;\n\
-        data.x = read_imagei(input, coord.xy).x;\n\
-        data.y = read_imagei(input, coord.zy).x;\n\
-\n\
-        p_share_k[lx] = data.x;\n\
-        p_share_v[lx] = coord.x;\n\
-\n\
-        p_share_k[lx1] = data.y;\n\
-        p_share_v[lx1] = coord.z;\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\
-\n\
-        if (p_share_k[index_minus_1] < min_data)\n\
-        {\n\
-            continue;\n\
-        }\n\
-\n\
-        p_share_k[lx] = p_share_k[lx1];\n\
-        p_share_v[lx] = p_share_v[lx1];\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\
-\n\
-        min_data = local_data[511];\n\
-        p_share_k[lx] = init_k;\n\
-        p_share_v[lx] = init_v;\n\
-        p_share_k[lx1] = init_k;\n\
-        p_share_v[lx1] = init_v;\n\
-    }\n\
-\n\
-    if (width > limit)\n\
-    {\n\
-        if (coord.x < width)\n\
-        {\n\
-            int2 data;\n\
-            data.x = read_imagei(input, coord.xy).x;\n\
-            coord.z = coord.x + threads_per_block;\n\
-            data.y = read_imagei(input, coord.zy).x;\n\
-\n\
-            p_share_k[lx] = data.x;\n\
-            p_share_v[lx] = coord.x;\n\
-\n\
-            p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\
-            p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\
-        }\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\
-\n\
-        if (p_share_k[index_minus_1] >= min_data)\n\
-        {\n\
-            p_share_k[lx] = p_share_k[lx1];\n\
-            p_share_v[lx] = p_share_v[lx1];\n\
-            barrier(CLK_LOCAL_MEM_FENCE);\n\
-            bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\
-        }\n\
-    }\n\
-\n\
-    int4 dst;\n\
-    dst.x = local_data[lx];\n\
-\n\
-    coord.x = lx;\n\
-    write_imagei(output, coord.xy, dst.xxxx);\n\
-\n\
-    int4 index;\n\
-    index.x = local_indices[lx];\n\
-\n\
-    write_imagei(indices, coord.xy, index.xxxx);\n\
-}\n\
-\n\
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32\n\
-(\n\
-  __read_only  image2d_t input,\n\
-  __write_only image2d_t output,\n\
-  __write_only image2d_t indices,\n\
-               float     input_scale,\n\
-               float     input_tail,\n\
-               float     output_scale,\n\
-               float     output_tail,\n\
-               int       _num_stages,\n\
-               int       width\n\
-  )\n\
- {\n\
-    uint lx = get_local_id(0);\n\
-    const uint init_k = 0;\n\
-    const int init_v = -2147483647;\n\
-    const int num_stages = 9;\n\
-    const int threads_per_block = BLOCK_SIZE;\n\
-    const int index_minus_1 = threads_per_block * 2 - 1;\n\
-    uint offset = 0;\n\
-    uint lx1 = lx + threads_per_block;\n\
-\n\
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\
-\n\
-    __local uint local_data[1536];\n\
-    __local int local_indices[1536];\n\
-\n\
-    uint left = read_imageui(input, coord.xy).x;\n\
-    coord.z += threads_per_block;\n\
-    uint right = read_imageui(input, coord.zy).x;\n\
-\n\
-    local_data[lx] = left;\n\
-    local_indices[lx] = coord.x;\n\
-    local_data[lx1] = right;\n\
-    local_indices[lx1] = coord.z;\n\
-\n\
-    barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-    bitonic_step_uint(num_stages, lx, local_data, local_indices);\n\
-\n\
-    uint min_data = local_data[511];\n\
-\n\
-    uint *p_share_k = local_data + threads_per_block;\n\
-    int *p_share_v = local_indices + threads_per_block;\n\
-\n\
-    int limit = (width >> 10) << 10;\n\
-    p_share_k[lx] = init_k;\n\
-    p_share_v[lx] = init_v;\n\
-\n\
-    p_share_k[lx1] = init_k;\n\
-    p_share_v[lx1] = init_v;\n\
-    barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-    for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\
-    {\n\
-        uint2 data;\n\
-        coord.z = coord.x + threads_per_block;\n\
-        data.x = read_imageui(input, coord.xy).x;\n\
-        data.y = read_imageui(input, coord.zy).x;\n\
-\n\
-        p_share_k[lx] = data.x;\n\
-        p_share_v[lx] = coord.x;\n\
-\n\
-        p_share_k[lx1] = data.y;\n\
-        p_share_v[lx1] = coord.z;\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\
-\n\
-        if (p_share_k[index_minus_1] < min_data)\n\
-        {\n\
-            continue;\n\
-        }\n\
-\n\
-        p_share_k[lx] = p_share_k[lx1];\n\
-        p_share_v[lx] = p_share_v[lx1];\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\
-\n\
-        min_data = local_data[511];\n\
-        p_share_k[lx] = init_k;\n\
-        p_share_v[lx] = init_v;\n\
-        p_share_k[lx1] = init_k;\n\
-        p_share_v[lx1] = init_v;\n\
-    }\n\
-\n\
-    if (width > limit)\n\
-    {\n\
-        if (coord.x < width)\n\
-        {\n\
-            uint2 data;\n\
-            data.x = read_imageui(input, coord.xy).x;\n\
-            coord.z = coord.x + threads_per_block;\n\
-            data.y = read_imageui(input, coord.zy).x;\n\
-\n\
-            p_share_k[lx] = data.x;\n\
-            p_share_v[lx] = coord.x;\n\
-\n\
-            p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\
-            p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\
-        }\n\
-        barrier(CLK_LOCAL_MEM_FENCE);\n\
-\n\
-        bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\
-\n\
-        if (p_share_k[index_minus_1] >= min_data)\n\
-        {\n\
-            p_share_k[lx] = p_share_k[lx1];\n\
-            p_share_v[lx] = p_share_v[lx1];\n\
-            barrier(CLK_LOCAL_MEM_FENCE);\n\
-            bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\
-        }\n\
-    }\n\
-\n\
-    uint4 dst;\n\
-    dst.x = local_data[lx];\n\
-\n\
-    coord.x = lx;\n\
-    write_imageui(output, coord.xy, dst.xxxx);\n\
-\n\
-    int4 index;\n\
-    index.x = local_indices[lx];\n\
-\n\
-    write_imagei(indices, coord.xy, index.xxxx);\n\
-}\n\
-"; /* end of topk2_cl*/
-
 static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X    (32)\n\
 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\
  (\n\
@@ -82484,7 +82114,6 @@ static const source_map_t cl_resource[] =
     {"swish_cl", swish_cl},
     {"tile_cl", tile_cl},
     {"topk_cl", topk_cl},
-    {"topk2_cl", topk2_cl},
     {"topk_odd_even_sort_cl", topk_odd_even_sort_cl},
     {"topk_odd_even_sort2_cl", topk_odd_even_sort2_cl},
     {"upsample_cl", upsample_cl},