diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 477cb19..4ac9f61 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -60,7 +60,9 @@ typedef struct _vsi_nn_hw_config_t { char target_name[VSI_NN_MAX_TARGET_NAME]; vsi_nn_hw_evis_t evis; +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT uint32_t subGroupSize; +#endif uint32_t use_40bits_va; uint32_t support_stream_processor; uint32_t sp_exec_count; diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 0fafc24..92f8349 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 2 -#define VSI_NN_VERSION_PATCH 6 +#define VSI_NN_VERSION_PATCH 5 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index a21d290..b8cdfd0 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -34,24 +34,20 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS #define _TOPK_KERNEL_SOURCE "topk" #define STR(a) #a // Add kernel hashtable here -#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \ - ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26)) +#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) ) #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \ - { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \ CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ _TOPK_KERNEL_SOURCE } -#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ - { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \ - CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ - "topk2" } - #define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) ) #define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ @@ -115,9 +111,6 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( F32, I32, 4 ), PACK_KERNEL_MAP( F32, I32, 5 ), PACK_KERNEL_MAP( F32, I32, 6 ), - - PACK_MERGE_KERNEL_MAP(U32, U32), - PACK_MERGE_KERNEL_MAP(I32, I32), }; static const _kernel_map_type _topk_odd_even_sort_kernel_map[] = @@ -261,8 +254,7 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t num_stages, - vsi_bool is_bitnoic_segment + int32_t num_stages ) { vsi_status status = VSI_FAILURE; @@ -280,23 +272,21 @@ static vsi_status _query_kernel in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - num_stages = is_bitnoic_segment ? 0 : num_stages; - switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) { case _PACK_SELECT_KEY(F32, F32): case _PACK_SELECT_KEY(F16, F16): - key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment ); + key = TOPK_HASH_KEY( F32, F32, num_stages ); break; case _PACK_SELECT_KEY(U32, U32): case _PACK_SELECT_KEY(U16, U16): case _PACK_SELECT_KEY(U8, U8): - key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment ); + key = TOPK_HASH_KEY( U32, U32, num_stages ); break; case _PACK_SELECT_KEY(I32, I32): case _PACK_SELECT_KEY(I16, I16): case _PACK_SELECT_KEY(I8, I8): - key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment ); + key = TOPK_HASH_KEY( I32, I32, num_stages ); break; case _PACK_SELECT_KEY(F32, U32): case _PACK_SELECT_KEY(F16, U32): @@ -304,7 +294,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, U16): case _PACK_SELECT_KEY(F32, U8): case _PACK_SELECT_KEY(F16, U8): - key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment ); + key = TOPK_HASH_KEY( F32, U32, num_stages ); break; case _PACK_SELECT_KEY(F32, I32): case _PACK_SELECT_KEY(F16, I32): @@ -312,7 +302,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, I16): case _PACK_SELECT_KEY(F32, I8): case _PACK_SELECT_KEY(F16, I8): - key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment ); + key = TOPK_HASH_KEY( F32, I32, num_stages ); break; default: break; @@ -450,12 +440,7 @@ static vsi_nn_kernel_node_t _setup int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0); vsi_bool is_odd_even_sort = FALSE; - vsi_bool is_bitnoic_segment = FALSE; size_t param_num = _TOPK_PARAM_NUM; - int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2); - vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - float inputScale = vsi_nn_get_tensor_scale(inputs[0]); float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float outputScale = vsi_nn_get_tensor_scale(outputs[0]); @@ -486,14 +471,9 @@ static vsi_nn_kernel_node_t _setup rs_tensors[0] = vsi_nn_reshape_tensor( graph, inputs[0], shape[0], 2 ); - is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) && - type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32); - num_stages = is_bitnoic_segment ? 9 : num_stages; - max_stages = is_bitnoic_segment ? max_stages : 7; - - if (num_stages < max_stages || is_bitnoic_segment) + if (num_stages < 7) { - status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment ); + status = _query_kernel( kernel, inputs, outputs, num_stages ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shape[1], 2 ); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl index dc20389..0e6166c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl @@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + if ((left_elem < right_elem) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \ uint right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + if ((left_elem < right_elem) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -227,7 +227,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \ int right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + if ((left_elem < right_elem) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -315,7 +315,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + if ((left_elem < right_elem) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -403,7 +403,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + if ((left_elem < right_elem) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl deleted file mode 100644 index 0eae5ab..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl +++ /dev/null @@ -1,368 +0,0 @@ - -#define BITONIC_STEP(dtype) \ -void bitonic_step_##dtype(uint num_stages, int lx, \ - __local dtype *local_data, __local int *local_indices) \ -{ \ - for (uint stage = 0; stage < num_stages + 1; ++stage) \ - { \ - uint signo = (lx >> stage) & 1; \ - \ - for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ - { \ - uint postShift = (stage - passOfStage); \ - uint pairDistance = 1 << postShift; \ - \ - uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ - uint right_id = left_id + pairDistance; \ - \ - int left_idx = local_indices[left_id]; \ - int right_idx = local_indices[right_id]; \ - \ - dtype left_elem = local_data[left_id]; \ - dtype right_elem = local_data[right_id]; \ - \ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ - { \ - local_data[left_id] = right_elem; \ - local_data[right_id] = left_elem; \ - \ - local_indices[left_id] = right_idx; \ - local_indices[right_id] = left_idx; \ - } \ - \ - barrier(CLK_LOCAL_MEM_FENCE); \ - } \ - } \ -} -BITONIC_STEP(int) -BITONIC_STEP(uint) - -#define BITONIC_STEP_ASCEND(dtype) \ -void bitonic_step_ascend_##dtype(uint num_stages, int lx, \ - __local dtype *p_share_k, __local int *p_share_v) \ -{ \ - for (uint stage = 0; stage < num_stages + 1; ++stage) \ - { \ - uint signo = (lx >> stage) & 1; \ - \ - for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ - { \ - uint postShift = (stage - passOfStage); \ - uint pairDistance = 1 << postShift; \ - \ - uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ - uint right_id = left_id + pairDistance; \ - \ - int left_idx = p_share_v[left_id]; \ - int right_idx = p_share_v[right_id]; \ - \ - dtype left_elem = p_share_k[left_id]; \ - dtype right_elem = p_share_k[right_id]; \ - \ - if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \ - { \ - p_share_k[left_id] = right_elem; \ - p_share_k[right_id] = left_elem; \ - \ - p_share_v[left_id] = right_idx; \ - p_share_v[right_id] = left_idx; \ - } \ - \ - barrier(CLK_LOCAL_MEM_FENCE); \ - } \ - } \ -} -BITONIC_STEP_ASCEND(int) -BITONIC_STEP_ASCEND(uint) - -#define BITONIC_MERGE(dtype) \ -void bitonic_merge_##dtype(uint num_stages, int lx, \ - __local dtype *local_data, __local int *local_indices) \ -{ \ - uint stage = num_stages; \ - uint signo = (lx >> stage) & 1; \ - \ - for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ - { \ - uint postShift = (stage - passOfStage); \ - uint pairDistance = 1 << postShift; \ - \ - uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ - uint right_id = left_id + pairDistance; \ - \ - int left_idx = local_indices[left_id]; \ - int right_idx = local_indices[right_id]; \ - \ - dtype left_elem = local_data[left_id]; \ - dtype right_elem = local_data[right_id]; \ - \ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ - { \ - local_data[left_id] = right_elem; \ - local_data[right_id] = left_elem; \ - \ - local_indices[left_id] = right_idx; \ - local_indices[right_id] = left_idx; \ - } \ - \ - barrier(CLK_LOCAL_MEM_FENCE); \ - } \ -} -BITONIC_MERGE(int) -BITONIC_MERGE(uint) - -#define BLOCK_SIZE (512) - -__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32 -( - __read_only image2d_t input, - __write_only image2d_t output, - __write_only image2d_t indices, - float input_scale, - float input_tail, - float output_scale, - float output_tail, - int _num_stages, - int width - ) - { - uint lx = get_local_id(0); - const int init_k = -2147483647; - const int init_v = -2147483647; - const int num_stages = 9; - const int threads_per_block = BLOCK_SIZE; - const int index_minus_1 = threads_per_block * 2 - 1; - uint offset = 0; - uint lx1 = lx + threads_per_block; - - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - __local int local_data[1536]; - __local int local_indices[1536]; - - int left = read_imagei(input, coord.xy).x; - coord.z += threads_per_block; - int right = read_imagei(input, coord.zy).x; - - local_data[lx] = left; - local_indices[lx] = coord.x; - local_data[lx1] = right; - local_indices[lx1] = coord.z; - - barrier(CLK_LOCAL_MEM_FENCE); - - bitonic_step_int(num_stages, lx, local_data, local_indices); - - int min_data = local_data[511]; - - int *p_share_k = local_data + threads_per_block; - int *p_share_v = local_indices + threads_per_block; - - int limit = (width >> 10) << 10; - p_share_k[lx] = init_k; - p_share_v[lx] = init_v; - - p_share_k[lx1] = init_k; - p_share_v[lx1] = init_v; - barrier(CLK_LOCAL_MEM_FENCE); - - for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2) - { - int2 data; - coord.z = coord.x + threads_per_block; - data.x = read_imagei(input, coord.xy).x; - data.y = read_imagei(input, coord.zy).x; - - p_share_k[lx] = data.x; - p_share_v[lx] = coord.x; - - p_share_k[lx1] = data.y; - p_share_v[lx1] = coord.z; - barrier(CLK_LOCAL_MEM_FENCE); - - bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); - - if (p_share_k[index_minus_1] < min_data) - { - continue; - } - - p_share_k[lx] = p_share_k[lx1]; - p_share_v[lx] = p_share_v[lx1]; - barrier(CLK_LOCAL_MEM_FENCE); - - bitonic_merge_int(num_stages, lx, local_data, local_indices); - - min_data = local_data[511]; - p_share_k[lx] = init_k; - p_share_v[lx] = init_v; - p_share_k[lx1] = init_k; - p_share_v[lx1] = init_v; - } - - if (width > limit) - { - if (coord.x < width) - { - int2 data; - data.x = read_imagei(input, coord.xy).x; - coord.z = coord.x + threads_per_block; - data.y = read_imagei(input, coord.zy).x; - - p_share_k[lx] = data.x; - p_share_v[lx] = coord.x; - - p_share_k[lx1] = coord.z < width ? data.y : init_k; - p_share_v[lx1] = coord.z < width ? coord.z : init_v; - } - barrier(CLK_LOCAL_MEM_FENCE); - - bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); - - if (p_share_k[index_minus_1] >= min_data) - { - p_share_k[lx] = p_share_k[lx1]; - p_share_v[lx] = p_share_v[lx1]; - barrier(CLK_LOCAL_MEM_FENCE); - bitonic_merge_int(num_stages, lx, local_data, local_indices); - } - } - - int4 dst; - dst.x = local_data[lx]; - - coord.x = lx; - write_imagei(output, coord.xy, dst.xxxx); - - int4 index; - index.x = local_indices[lx]; - - write_imagei(indices, coord.xy, index.xxxx); -} - -__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32 -( - __read_only image2d_t input, - __write_only image2d_t output, - __write_only image2d_t indices, - float input_scale, - float input_tail, - float output_scale, - float output_tail, - int _num_stages, - int width - ) - { - uint lx = get_local_id(0); - const uint init_k = 0; - const int init_v = -2147483647; - const int num_stages = 9; - const int threads_per_block = BLOCK_SIZE; - const int index_minus_1 = threads_per_block * 2 - 1; - uint offset = 0; - uint lx1 = lx + threads_per_block; - - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - __local uint local_data[1536]; - __local int local_indices[1536]; - - uint left = read_imageui(input, coord.xy).x; - coord.z += threads_per_block; - uint right = read_imageui(input, coord.zy).x; - - local_data[lx] = left; - local_indices[lx] = coord.x; - local_data[lx1] = right; - local_indices[lx1] = coord.z; - - barrier(CLK_LOCAL_MEM_FENCE); - - bitonic_step_uint(num_stages, lx, local_data, local_indices); - - uint min_data = local_data[511]; - - uint *p_share_k = local_data + threads_per_block; - int *p_share_v = local_indices + threads_per_block; - - int limit = (width >> 10) << 10; - p_share_k[lx] = init_k; - p_share_v[lx] = init_v; - - p_share_k[lx1] = init_k; - p_share_v[lx1] = init_v; - barrier(CLK_LOCAL_MEM_FENCE); - - for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2) - { - uint2 data; - coord.z = coord.x + threads_per_block; - data.x = read_imageui(input, coord.xy).x; - data.y = read_imageui(input, coord.zy).x; - - p_share_k[lx] = data.x; - p_share_v[lx] = coord.x; - - p_share_k[lx1] = data.y; - p_share_v[lx1] = coord.z; - barrier(CLK_LOCAL_MEM_FENCE); - - bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); - - if (p_share_k[index_minus_1] < min_data) - { - continue; - } - - p_share_k[lx] = p_share_k[lx1]; - p_share_v[lx] = p_share_v[lx1]; - barrier(CLK_LOCAL_MEM_FENCE); - - bitonic_merge_uint(num_stages, lx, local_data, local_indices); - - min_data = local_data[511]; - p_share_k[lx] = init_k; - p_share_v[lx] = init_v; - p_share_k[lx1] = init_k; - p_share_v[lx1] = init_v; - } - - if (width > limit) - { - if (coord.x < width) - { - uint2 data; - data.x = read_imageui(input, coord.xy).x; - coord.z = coord.x + threads_per_block; - data.y = read_imageui(input, coord.zy).x; - - p_share_k[lx] = data.x; - p_share_v[lx] = coord.x; - - p_share_k[lx1] = coord.z < width ? data.y : init_k; - p_share_v[lx1] = coord.z < width ? coord.z : init_v; - } - barrier(CLK_LOCAL_MEM_FENCE); - - bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); - - if (p_share_k[index_minus_1] >= min_data) - { - p_share_k[lx] = p_share_k[lx1]; - p_share_v[lx] = p_share_v[lx1]; - barrier(CLK_LOCAL_MEM_FENCE); - bitonic_merge_uint(num_stages, lx, local_data, local_indices); - } - } - - uint4 dst; - dst.x = local_data[lx]; - - coord.x = lx; - write_imageui(output, coord.xy, dst.xxxx); - - int4 index; - index.x = local_indices[lx]; - - write_imagei(indices, coord.xy, index.xxxx); -} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index d19b519..debd687 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -80509,7 +80509,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80597,7 +80597,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \\\n\ uint right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80685,7 +80685,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \\\n\ int right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80773,7 +80773,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80861,7 +80861,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80896,376 +80896,6 @@ TOPK_F32toI32(1 << 4, 4)\n\ TOPK_F32toI32(1 << 5, 5)\n\ TOPK_F32toI32(1 << 6, 6)"; /* end of topk_cl*/ -static const char topk2_cl[] = "\n\ -#define BITONIC_STEP(dtype) \\\n\ -void bitonic_step_##dtype(uint num_stages, int lx, \\\n\ - __local dtype *local_data, __local int *local_indices) \\\n\ -{ \\\n\ - for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ - { \\\n\ - uint signo = (lx >> stage) & 1; \\\n\ - \\\n\ - for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ - { \\\n\ - uint postShift = (stage - passOfStage); \\\n\ - uint pairDistance = 1 << postShift; \\\n\ - \\\n\ - uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ - uint right_id = left_id + pairDistance; \\\n\ - \\\n\ - int left_idx = local_indices[left_id]; \\\n\ - int right_idx = local_indices[right_id]; \\\n\ - \\\n\ - dtype left_elem = local_data[left_id]; \\\n\ - dtype right_elem = local_data[right_id]; \\\n\ - \\\n\ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ - { \\\n\ - local_data[left_id] = right_elem; \\\n\ - local_data[right_id] = left_elem; \\\n\ - \\\n\ - local_indices[left_id] = right_idx; \\\n\ - local_indices[right_id] = left_idx; \\\n\ - } \\\n\ - \\\n\ - barrier(CLK_LOCAL_MEM_FENCE); \\\n\ - } \\\n\ - } \\\n\ -}\n\ -BITONIC_STEP(int)\n\ -BITONIC_STEP(uint)\n\ -\n\ -#define BITONIC_STEP_ASCEND(dtype) \\\n\ -void bitonic_step_ascend_##dtype(uint num_stages, int lx, \\\n\ - __local dtype *p_share_k, __local int *p_share_v) \\\n\ -{ \\\n\ - for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ - { \\\n\ - uint signo = (lx >> stage) & 1; \\\n\ - \\\n\ - for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ - { \\\n\ - uint postShift = (stage - passOfStage); \\\n\ - uint pairDistance = 1 << postShift; \\\n\ - \\\n\ - uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ - uint right_id = left_id + pairDistance; \\\n\ - \\\n\ - int left_idx = p_share_v[left_id]; \\\n\ - int right_idx = p_share_v[right_id]; \\\n\ - \\\n\ - dtype left_elem = p_share_k[left_id]; \\\n\ - dtype right_elem = p_share_k[right_id]; \\\n\ - \\\n\ - if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \\\n\ - { \\\n\ - p_share_k[left_id] = right_elem; \\\n\ - p_share_k[right_id] = left_elem; \\\n\ - \\\n\ - p_share_v[left_id] = right_idx; \\\n\ - p_share_v[right_id] = left_idx; \\\n\ - } \\\n\ - \\\n\ - barrier(CLK_LOCAL_MEM_FENCE); \\\n\ - } \\\n\ - } \\\n\ -}\n\ -BITONIC_STEP_ASCEND(int)\n\ -BITONIC_STEP_ASCEND(uint)\n\ -\n\ -#define BITONIC_MERGE(dtype) \\\n\ -void bitonic_merge_##dtype(uint num_stages, int lx, \\\n\ - __local dtype *local_data, __local int *local_indices) \\\n\ -{ \\\n\ - uint stage = num_stages; \\\n\ - uint signo = (lx >> stage) & 1; \\\n\ - \\\n\ - for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ - { \\\n\ - uint postShift = (stage - passOfStage); \\\n\ - uint pairDistance = 1 << postShift; \\\n\ - \\\n\ - uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ - uint right_id = left_id + pairDistance; \\\n\ - \\\n\ - int left_idx = local_indices[left_id]; \\\n\ - int right_idx = local_indices[right_id]; \\\n\ - \\\n\ - dtype left_elem = local_data[left_id]; \\\n\ - dtype right_elem = local_data[right_id]; \\\n\ - \\\n\ - if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ - { \\\n\ - local_data[left_id] = right_elem; \\\n\ - local_data[right_id] = left_elem; \\\n\ - \\\n\ - local_indices[left_id] = right_idx; \\\n\ - local_indices[right_id] = left_idx; \\\n\ - } \\\n\ - \\\n\ - barrier(CLK_LOCAL_MEM_FENCE); \\\n\ - } \\\n\ -}\n\ -BITONIC_MERGE(int)\n\ -BITONIC_MERGE(uint)\n\ -\n\ -#define BLOCK_SIZE (512)\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32\n\ -(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - __write_only image2d_t indices,\n\ - float input_scale,\n\ - float input_tail,\n\ - float output_scale,\n\ - float output_tail,\n\ - int _num_stages,\n\ - int width\n\ - )\n\ - {\n\ - uint lx = get_local_id(0);\n\ - const int init_k = -2147483647;\n\ - const int init_v = -2147483647;\n\ - const int num_stages = 9;\n\ - const int threads_per_block = BLOCK_SIZE;\n\ - const int index_minus_1 = threads_per_block * 2 - 1;\n\ - uint offset = 0;\n\ - uint lx1 = lx + threads_per_block;\n\ -\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - __local int local_data[1536];\n\ - __local int local_indices[1536];\n\ -\n\ - int left = read_imagei(input, coord.xy).x;\n\ - coord.z += threads_per_block;\n\ - int right = read_imagei(input, coord.zy).x;\n\ -\n\ - local_data[lx] = left;\n\ - local_indices[lx] = coord.x;\n\ - local_data[lx1] = right;\n\ - local_indices[lx1] = coord.z;\n\ -\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - bitonic_step_int(num_stages, lx, local_data, local_indices);\n\ -\n\ - int min_data = local_data[511];\n\ -\n\ - int *p_share_k = local_data + threads_per_block;\n\ - int *p_share_v = local_indices + threads_per_block;\n\ -\n\ - int limit = (width >> 10) << 10;\n\ - p_share_k[lx] = init_k;\n\ - p_share_v[lx] = init_v;\n\ -\n\ - p_share_k[lx1] = init_k;\n\ - p_share_v[lx1] = init_v;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ - {\n\ - int2 data;\n\ - coord.z = coord.x + threads_per_block;\n\ - data.x = read_imagei(input, coord.xy).x;\n\ - data.y = read_imagei(input, coord.zy).x;\n\ -\n\ - p_share_k[lx] = data.x;\n\ - p_share_v[lx] = coord.x;\n\ -\n\ - p_share_k[lx1] = data.y;\n\ - p_share_v[lx1] = coord.z;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ -\n\ - if (p_share_k[index_minus_1] < min_data)\n\ - {\n\ - continue;\n\ - }\n\ -\n\ - p_share_k[lx] = p_share_k[lx1];\n\ - p_share_v[lx] = p_share_v[lx1];\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ -\n\ - min_data = local_data[511];\n\ - p_share_k[lx] = init_k;\n\ - p_share_v[lx] = init_v;\n\ - p_share_k[lx1] = init_k;\n\ - p_share_v[lx1] = init_v;\n\ - }\n\ -\n\ - if (width > limit)\n\ - {\n\ - if (coord.x < width)\n\ - {\n\ - int2 data;\n\ - data.x = read_imagei(input, coord.xy).x;\n\ - coord.z = coord.x + threads_per_block;\n\ - data.y = read_imagei(input, coord.zy).x;\n\ -\n\ - p_share_k[lx] = data.x;\n\ - p_share_v[lx] = coord.x;\n\ -\n\ - p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ - p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\ - }\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ -\n\ - if (p_share_k[index_minus_1] >= min_data)\n\ - {\n\ - p_share_k[lx] = p_share_k[lx1];\n\ - p_share_v[lx] = p_share_v[lx1];\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ - bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ - }\n\ - }\n\ -\n\ - int4 dst;\n\ - dst.x = local_data[lx];\n\ -\n\ - coord.x = lx;\n\ - write_imagei(output, coord.xy, dst.xxxx);\n\ -\n\ - int4 index;\n\ - index.x = local_indices[lx];\n\ -\n\ - write_imagei(indices, coord.xy, index.xxxx);\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32\n\ -(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - __write_only image2d_t indices,\n\ - float input_scale,\n\ - float input_tail,\n\ - float output_scale,\n\ - float output_tail,\n\ - int _num_stages,\n\ - int width\n\ - )\n\ - {\n\ - uint lx = get_local_id(0);\n\ - const uint init_k = 0;\n\ - const int init_v = -2147483647;\n\ - const int num_stages = 9;\n\ - const int threads_per_block = BLOCK_SIZE;\n\ - const int index_minus_1 = threads_per_block * 2 - 1;\n\ - uint offset = 0;\n\ - uint lx1 = lx + threads_per_block;\n\ -\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - __local uint local_data[1536];\n\ - __local int local_indices[1536];\n\ -\n\ - uint left = read_imageui(input, coord.xy).x;\n\ - coord.z += threads_per_block;\n\ - uint right = read_imageui(input, coord.zy).x;\n\ -\n\ - local_data[lx] = left;\n\ - local_indices[lx] = coord.x;\n\ - local_data[lx1] = right;\n\ - local_indices[lx1] = coord.z;\n\ -\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - bitonic_step_uint(num_stages, lx, local_data, local_indices);\n\ -\n\ - uint min_data = local_data[511];\n\ -\n\ - uint *p_share_k = local_data + threads_per_block;\n\ - int *p_share_v = local_indices + threads_per_block;\n\ -\n\ - int limit = (width >> 10) << 10;\n\ - p_share_k[lx] = init_k;\n\ - p_share_v[lx] = init_v;\n\ -\n\ - p_share_k[lx1] = init_k;\n\ - p_share_v[lx1] = init_v;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ - {\n\ - uint2 data;\n\ - coord.z = coord.x + threads_per_block;\n\ - data.x = read_imageui(input, coord.xy).x;\n\ - data.y = read_imageui(input, coord.zy).x;\n\ -\n\ - p_share_k[lx] = data.x;\n\ - p_share_v[lx] = coord.x;\n\ -\n\ - p_share_k[lx1] = data.y;\n\ - p_share_v[lx1] = coord.z;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ -\n\ - if (p_share_k[index_minus_1] < min_data)\n\ - {\n\ - continue;\n\ - }\n\ -\n\ - p_share_k[lx] = p_share_k[lx1];\n\ - p_share_v[lx] = p_share_v[lx1];\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ -\n\ - min_data = local_data[511];\n\ - p_share_k[lx] = init_k;\n\ - p_share_v[lx] = init_v;\n\ - p_share_k[lx1] = init_k;\n\ - p_share_v[lx1] = init_v;\n\ - }\n\ -\n\ - if (width > limit)\n\ - {\n\ - if (coord.x < width)\n\ - {\n\ - uint2 data;\n\ - data.x = read_imageui(input, coord.xy).x;\n\ - coord.z = coord.x + threads_per_block;\n\ - data.y = read_imageui(input, coord.zy).x;\n\ -\n\ - p_share_k[lx] = data.x;\n\ - p_share_v[lx] = coord.x;\n\ -\n\ - p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ - p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\ - }\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ -\n\ - if (p_share_k[index_minus_1] >= min_data)\n\ - {\n\ - p_share_k[lx] = p_share_k[lx1];\n\ - p_share_v[lx] = p_share_v[lx1];\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ - bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ - }\n\ - }\n\ -\n\ - uint4 dst;\n\ - dst.x = local_data[lx];\n\ -\n\ - coord.x = lx;\n\ - write_imageui(output, coord.xy, dst.xxxx);\n\ -\n\ - int4 index;\n\ - index.x = local_indices[lx];\n\ -\n\ - write_imagei(indices, coord.xy, index.xxxx);\n\ -}\n\ -"; /* end of topk2_cl*/ - static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X (32)\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\ (\n\ @@ -82484,7 +82114,6 @@ static const source_map_t cl_resource[] = {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, {"topk_cl", topk_cl}, - {"topk2_cl", topk2_cl}, {"topk_odd_even_sort_cl", topk_odd_even_sort_cl}, {"topk_odd_even_sort2_cl", topk_odd_even_sort2_cl}, {"upsample_cl", upsample_cl},