Revert ovxlib topk kernel modification (#694)

Internal ovxlib commit:b12b1f138e66c78e0fb4032e5399a68a7280a801 is
revert for sw compatibility

Type:  Bug Fix

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
This commit is contained in:
Chen Feiyue 2024-04-27 07:54:30 +08:00 committed by GitHub
parent 3b80968fb1
commit e1c2f0a18d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 25 additions and 782 deletions

View File

@ -60,7 +60,9 @@ typedef struct _vsi_nn_hw_config_t
{
char target_name[VSI_NN_MAX_TARGET_NAME];
vsi_nn_hw_evis_t evis;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
uint32_t subGroupSize;
#endif
uint32_t use_40bits_va;
uint32_t support_stream_processor;
uint32_t sp_exec_count;

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 6
#define VSI_NN_VERSION_PATCH 5
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)

View File

@ -34,24 +34,20 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
#define _TOPK_KERNEL_SOURCE "topk"
#define STR(a) #a
// Add kernel hashtable here
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26))
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
_TOPK_KERNEL_SOURCE }
#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \
CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
"topk2" }
#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
@ -115,9 +111,6 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( F32, I32, 4 ),
PACK_KERNEL_MAP( F32, I32, 5 ),
PACK_KERNEL_MAP( F32, I32, 6 ),
PACK_MERGE_KERNEL_MAP(U32, U32),
PACK_MERGE_KERNEL_MAP(I32, I32),
};
static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
@ -261,8 +254,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t num_stages,
vsi_bool is_bitnoic_segment
int32_t num_stages
)
{
vsi_status status = VSI_FAILURE;
@ -280,23 +272,21 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
num_stages = is_bitnoic_segment ? 0 : num_stages;
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment );
key = TOPK_HASH_KEY( F32, F32, num_stages );
break;
case _PACK_SELECT_KEY(U32, U32):
case _PACK_SELECT_KEY(U16, U16):
case _PACK_SELECT_KEY(U8, U8):
key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment );
key = TOPK_HASH_KEY( U32, U32, num_stages );
break;
case _PACK_SELECT_KEY(I32, I32):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment );
key = TOPK_HASH_KEY( I32, I32, num_stages );
break;
case _PACK_SELECT_KEY(F32, U32):
case _PACK_SELECT_KEY(F16, U32):
@ -304,7 +294,7 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(F16, U16):
case _PACK_SELECT_KEY(F32, U8):
case _PACK_SELECT_KEY(F16, U8):
key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment );
key = TOPK_HASH_KEY( F32, U32, num_stages );
break;
case _PACK_SELECT_KEY(F32, I32):
case _PACK_SELECT_KEY(F16, I32):
@ -312,7 +302,7 @@ static vsi_status _query_kernel
case _PACK_SELECT_KEY(F16, I16):
case _PACK_SELECT_KEY(F32, I8):
case _PACK_SELECT_KEY(F16, I8):
key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment );
key = TOPK_HASH_KEY( F32, I32, num_stages );
break;
default:
break;
@ -450,12 +440,7 @@ static vsi_nn_kernel_node_t _setup
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
vsi_bool is_odd_even_sort = FALSE;
vsi_bool is_bitnoic_segment = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
@ -486,14 +471,9 @@ static vsi_nn_kernel_node_t _setup
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], 2 );
is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) &&
type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32);
num_stages = is_bitnoic_segment ? 9 : num_stages;
max_stages = is_bitnoic_segment ? max_stages : 7;
if (num_stages < max_stages || is_bitnoic_segment)
if (num_stages < 7)
{
status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment );
status = _query_kernel( kernel, inputs, outputs, num_stages );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );

View File

@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
uint left_elem = local_data[left_id]; \
uint right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -227,7 +227,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
int left_elem = local_data[left_id]; \
int right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -315,7 +315,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
@ -403,7 +403,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \

View File

@ -1,368 +0,0 @@
#define BITONIC_STEP(dtype) \
void bitonic_step_##dtype(uint num_stages, int lx, \
__local dtype *local_data, __local int *local_indices) \
{ \
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
dtype left_elem = local_data[left_id]; \
dtype right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
}
BITONIC_STEP(int)
BITONIC_STEP(uint)
#define BITONIC_STEP_ASCEND(dtype) \
void bitonic_step_ascend_##dtype(uint num_stages, int lx, \
__local dtype *p_share_k, __local int *p_share_v) \
{ \
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = p_share_v[left_id]; \
int right_idx = p_share_v[right_id]; \
\
dtype left_elem = p_share_k[left_id]; \
dtype right_elem = p_share_k[right_id]; \
\
if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \
{ \
p_share_k[left_id] = right_elem; \
p_share_k[right_id] = left_elem; \
\
p_share_v[left_id] = right_idx; \
p_share_v[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
}
BITONIC_STEP_ASCEND(int)
BITONIC_STEP_ASCEND(uint)
#define BITONIC_MERGE(dtype) \
void bitonic_merge_##dtype(uint num_stages, int lx, \
__local dtype *local_data, __local int *local_indices) \
{ \
uint stage = num_stages; \
uint signo = (lx >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
dtype left_elem = local_data[left_id]; \
dtype right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
}
BITONIC_MERGE(int)
BITONIC_MERGE(uint)
#define BLOCK_SIZE (512)
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32
(
__read_only image2d_t input,
__write_only image2d_t output,
__write_only image2d_t indices,
float input_scale,
float input_tail,
float output_scale,
float output_tail,
int _num_stages,
int width
)
{
uint lx = get_local_id(0);
const int init_k = -2147483647;
const int init_v = -2147483647;
const int num_stages = 9;
const int threads_per_block = BLOCK_SIZE;
const int index_minus_1 = threads_per_block * 2 - 1;
uint offset = 0;
uint lx1 = lx + threads_per_block;
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
__local int local_data[1536];
__local int local_indices[1536];
int left = read_imagei(input, coord.xy).x;
coord.z += threads_per_block;
int right = read_imagei(input, coord.zy).x;
local_data[lx] = left;
local_indices[lx] = coord.x;
local_data[lx1] = right;
local_indices[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_int(num_stages, lx, local_data, local_indices);
int min_data = local_data[511];
int *p_share_k = local_data + threads_per_block;
int *p_share_v = local_indices + threads_per_block;
int limit = (width >> 10) << 10;
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
barrier(CLK_LOCAL_MEM_FENCE);
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
{
int2 data;
coord.z = coord.x + threads_per_block;
data.x = read_imagei(input, coord.xy).x;
data.y = read_imagei(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = data.y;
p_share_v[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] < min_data)
{
continue;
}
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_int(num_stages, lx, local_data, local_indices);
min_data = local_data[511];
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
}
if (width > limit)
{
if (coord.x < width)
{
int2 data;
data.x = read_imagei(input, coord.xy).x;
coord.z = coord.x + threads_per_block;
data.y = read_imagei(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = coord.z < width ? data.y : init_k;
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
}
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] >= min_data)
{
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_int(num_stages, lx, local_data, local_indices);
}
}
int4 dst;
dst.x = local_data[lx];
coord.x = lx;
write_imagei(output, coord.xy, dst.xxxx);
int4 index;
index.x = local_indices[lx];
write_imagei(indices, coord.xy, index.xxxx);
}
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32
(
__read_only image2d_t input,
__write_only image2d_t output,
__write_only image2d_t indices,
float input_scale,
float input_tail,
float output_scale,
float output_tail,
int _num_stages,
int width
)
{
uint lx = get_local_id(0);
const uint init_k = 0;
const int init_v = -2147483647;
const int num_stages = 9;
const int threads_per_block = BLOCK_SIZE;
const int index_minus_1 = threads_per_block * 2 - 1;
uint offset = 0;
uint lx1 = lx + threads_per_block;
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
__local uint local_data[1536];
__local int local_indices[1536];
uint left = read_imageui(input, coord.xy).x;
coord.z += threads_per_block;
uint right = read_imageui(input, coord.zy).x;
local_data[lx] = left;
local_indices[lx] = coord.x;
local_data[lx1] = right;
local_indices[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_uint(num_stages, lx, local_data, local_indices);
uint min_data = local_data[511];
uint *p_share_k = local_data + threads_per_block;
int *p_share_v = local_indices + threads_per_block;
int limit = (width >> 10) << 10;
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
barrier(CLK_LOCAL_MEM_FENCE);
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
{
uint2 data;
coord.z = coord.x + threads_per_block;
data.x = read_imageui(input, coord.xy).x;
data.y = read_imageui(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = data.y;
p_share_v[lx1] = coord.z;
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] < min_data)
{
continue;
}
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
min_data = local_data[511];
p_share_k[lx] = init_k;
p_share_v[lx] = init_v;
p_share_k[lx1] = init_k;
p_share_v[lx1] = init_v;
}
if (width > limit)
{
if (coord.x < width)
{
uint2 data;
data.x = read_imageui(input, coord.xy).x;
coord.z = coord.x + threads_per_block;
data.y = read_imageui(input, coord.zy).x;
p_share_k[lx] = data.x;
p_share_v[lx] = coord.x;
p_share_k[lx1] = coord.z < width ? data.y : init_k;
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
}
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
if (p_share_k[index_minus_1] >= min_data)
{
p_share_k[lx] = p_share_k[lx1];
p_share_v[lx] = p_share_v[lx1];
barrier(CLK_LOCAL_MEM_FENCE);
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
}
}
uint4 dst;
dst.x = local_data[lx];
coord.x = lx;
write_imageui(output, coord.xy, dst.xxxx);
int4 index;
index.x = local_indices[lx];
write_imagei(indices, coord.xy, index.xxxx);
}

View File

@ -80509,7 +80509,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \\\n\
float right_elem = local_data[right_id]; \\\n\
\\\n\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
if ((left_elem < right_elem) ^ signo) \\\n\
{ \\\n\
local_data[left_id] = right_elem; \\\n\
local_data[right_id] = left_elem; \\\n\
@ -80597,7 +80597,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
uint left_elem = local_data[left_id]; \\\n\
uint right_elem = local_data[right_id]; \\\n\
\\\n\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
if ((left_elem < right_elem) ^ signo) \\\n\
{ \\\n\
local_data[left_id] = right_elem; \\\n\
local_data[right_id] = left_elem; \\\n\
@ -80685,7 +80685,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
int left_elem = local_data[left_id]; \\\n\
int right_elem = local_data[right_id]; \\\n\
\\\n\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
if ((left_elem < right_elem) ^ signo) \\\n\
{ \\\n\
local_data[left_id] = right_elem; \\\n\
local_data[right_id] = left_elem; \\\n\
@ -80773,7 +80773,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \\\n\
float right_elem = local_data[right_id]; \\\n\
\\\n\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
if ((left_elem < right_elem) ^ signo) \\\n\
{ \\\n\
local_data[left_id] = right_elem; \\\n\
local_data[right_id] = left_elem; \\\n\
@ -80861,7 +80861,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
float left_elem = local_data[left_id]; \\\n\
float right_elem = local_data[right_id]; \\\n\
\\\n\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
if ((left_elem < right_elem) ^ signo) \\\n\
{ \\\n\
local_data[left_id] = right_elem; \\\n\
local_data[right_id] = left_elem; \\\n\
@ -80896,376 +80896,6 @@ TOPK_F32toI32(1 << 4, 4)\n\
TOPK_F32toI32(1 << 5, 5)\n\
TOPK_F32toI32(1 << 6, 6)"; /* end of topk_cl*/
static const char topk2_cl[] = "\n\
#define BITONIC_STEP(dtype) \\\n\
void bitonic_step_##dtype(uint num_stages, int lx, \\\n\
__local dtype *local_data, __local int *local_indices) \\\n\
{ \\\n\
for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\
{ \\\n\
uint signo = (lx >> stage) & 1; \\\n\
\\\n\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\
{ \\\n\
uint postShift = (stage - passOfStage); \\\n\
uint pairDistance = 1 << postShift; \\\n\
\\\n\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\
uint right_id = left_id + pairDistance; \\\n\
\\\n\
int left_idx = local_indices[left_id]; \\\n\
int right_idx = local_indices[right_id]; \\\n\
\\\n\
dtype left_elem = local_data[left_id]; \\\n\
dtype right_elem = local_data[right_id]; \\\n\
\\\n\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
{ \\\n\
local_data[left_id] = right_elem; \\\n\
local_data[right_id] = left_elem; \\\n\
\\\n\
local_indices[left_id] = right_idx; \\\n\
local_indices[right_id] = left_idx; \\\n\
} \\\n\
\\\n\
barrier(CLK_LOCAL_MEM_FENCE); \\\n\
} \\\n\
} \\\n\
}\n\
BITONIC_STEP(int)\n\
BITONIC_STEP(uint)\n\
\n\
#define BITONIC_STEP_ASCEND(dtype) \\\n\
void bitonic_step_ascend_##dtype(uint num_stages, int lx, \\\n\
__local dtype *p_share_k, __local int *p_share_v) \\\n\
{ \\\n\
for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\
{ \\\n\
uint signo = (lx >> stage) & 1; \\\n\
\\\n\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\
{ \\\n\
uint postShift = (stage - passOfStage); \\\n\
uint pairDistance = 1 << postShift; \\\n\
\\\n\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\
uint right_id = left_id + pairDistance; \\\n\
\\\n\
int left_idx = p_share_v[left_id]; \\\n\
int right_idx = p_share_v[right_id]; \\\n\
\\\n\
dtype left_elem = p_share_k[left_id]; \\\n\
dtype right_elem = p_share_k[right_id]; \\\n\
\\\n\
if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \\\n\
{ \\\n\
p_share_k[left_id] = right_elem; \\\n\
p_share_k[right_id] = left_elem; \\\n\
\\\n\
p_share_v[left_id] = right_idx; \\\n\
p_share_v[right_id] = left_idx; \\\n\
} \\\n\
\\\n\
barrier(CLK_LOCAL_MEM_FENCE); \\\n\
} \\\n\
} \\\n\
}\n\
BITONIC_STEP_ASCEND(int)\n\
BITONIC_STEP_ASCEND(uint)\n\
\n\
#define BITONIC_MERGE(dtype) \\\n\
void bitonic_merge_##dtype(uint num_stages, int lx, \\\n\
__local dtype *local_data, __local int *local_indices) \\\n\
{ \\\n\
uint stage = num_stages; \\\n\
uint signo = (lx >> stage) & 1; \\\n\
\\\n\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\
{ \\\n\
uint postShift = (stage - passOfStage); \\\n\
uint pairDistance = 1 << postShift; \\\n\
\\\n\
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\
uint right_id = left_id + pairDistance; \\\n\
\\\n\
int left_idx = local_indices[left_id]; \\\n\
int right_idx = local_indices[right_id]; \\\n\
\\\n\
dtype left_elem = local_data[left_id]; \\\n\
dtype right_elem = local_data[right_id]; \\\n\
\\\n\
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\
{ \\\n\
local_data[left_id] = right_elem; \\\n\
local_data[right_id] = left_elem; \\\n\
\\\n\
local_indices[left_id] = right_idx; \\\n\
local_indices[right_id] = left_idx; \\\n\
} \\\n\
\\\n\
barrier(CLK_LOCAL_MEM_FENCE); \\\n\
} \\\n\
}\n\
BITONIC_MERGE(int)\n\
BITONIC_MERGE(uint)\n\
\n\
#define BLOCK_SIZE (512)\n\
\n\
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32\n\
(\n\
__read_only image2d_t input,\n\
__write_only image2d_t output,\n\
__write_only image2d_t indices,\n\
float input_scale,\n\
float input_tail,\n\
float output_scale,\n\
float output_tail,\n\
int _num_stages,\n\
int width\n\
)\n\
{\n\
uint lx = get_local_id(0);\n\
const int init_k = -2147483647;\n\
const int init_v = -2147483647;\n\
const int num_stages = 9;\n\
const int threads_per_block = BLOCK_SIZE;\n\
const int index_minus_1 = threads_per_block * 2 - 1;\n\
uint offset = 0;\n\
uint lx1 = lx + threads_per_block;\n\
\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\
\n\
__local int local_data[1536];\n\
__local int local_indices[1536];\n\
\n\
int left = read_imagei(input, coord.xy).x;\n\
coord.z += threads_per_block;\n\
int right = read_imagei(input, coord.zy).x;\n\
\n\
local_data[lx] = left;\n\
local_indices[lx] = coord.x;\n\
local_data[lx1] = right;\n\
local_indices[lx1] = coord.z;\n\
\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
bitonic_step_int(num_stages, lx, local_data, local_indices);\n\
\n\
int min_data = local_data[511];\n\
\n\
int *p_share_k = local_data + threads_per_block;\n\
int *p_share_v = local_indices + threads_per_block;\n\
\n\
int limit = (width >> 10) << 10;\n\
p_share_k[lx] = init_k;\n\
p_share_v[lx] = init_v;\n\
\n\
p_share_k[lx1] = init_k;\n\
p_share_v[lx1] = init_v;\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\
{\n\
int2 data;\n\
coord.z = coord.x + threads_per_block;\n\
data.x = read_imagei(input, coord.xy).x;\n\
data.y = read_imagei(input, coord.zy).x;\n\
\n\
p_share_k[lx] = data.x;\n\
p_share_v[lx] = coord.x;\n\
\n\
p_share_k[lx1] = data.y;\n\
p_share_v[lx1] = coord.z;\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\
\n\
if (p_share_k[index_minus_1] < min_data)\n\
{\n\
continue;\n\
}\n\
\n\
p_share_k[lx] = p_share_k[lx1];\n\
p_share_v[lx] = p_share_v[lx1];\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\
\n\
min_data = local_data[511];\n\
p_share_k[lx] = init_k;\n\
p_share_v[lx] = init_v;\n\
p_share_k[lx1] = init_k;\n\
p_share_v[lx1] = init_v;\n\
}\n\
\n\
if (width > limit)\n\
{\n\
if (coord.x < width)\n\
{\n\
int2 data;\n\
data.x = read_imagei(input, coord.xy).x;\n\
coord.z = coord.x + threads_per_block;\n\
data.y = read_imagei(input, coord.zy).x;\n\
\n\
p_share_k[lx] = data.x;\n\
p_share_v[lx] = coord.x;\n\
\n\
p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\
p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\
}\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\
\n\
if (p_share_k[index_minus_1] >= min_data)\n\
{\n\
p_share_k[lx] = p_share_k[lx1];\n\
p_share_v[lx] = p_share_v[lx1];\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\
}\n\
}\n\
\n\
int4 dst;\n\
dst.x = local_data[lx];\n\
\n\
coord.x = lx;\n\
write_imagei(output, coord.xy, dst.xxxx);\n\
\n\
int4 index;\n\
index.x = local_indices[lx];\n\
\n\
write_imagei(indices, coord.xy, index.xxxx);\n\
}\n\
\n\
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32\n\
(\n\
__read_only image2d_t input,\n\
__write_only image2d_t output,\n\
__write_only image2d_t indices,\n\
float input_scale,\n\
float input_tail,\n\
float output_scale,\n\
float output_tail,\n\
int _num_stages,\n\
int width\n\
)\n\
{\n\
uint lx = get_local_id(0);\n\
const uint init_k = 0;\n\
const int init_v = -2147483647;\n\
const int num_stages = 9;\n\
const int threads_per_block = BLOCK_SIZE;\n\
const int index_minus_1 = threads_per_block * 2 - 1;\n\
uint offset = 0;\n\
uint lx1 = lx + threads_per_block;\n\
\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\
\n\
__local uint local_data[1536];\n\
__local int local_indices[1536];\n\
\n\
uint left = read_imageui(input, coord.xy).x;\n\
coord.z += threads_per_block;\n\
uint right = read_imageui(input, coord.zy).x;\n\
\n\
local_data[lx] = left;\n\
local_indices[lx] = coord.x;\n\
local_data[lx1] = right;\n\
local_indices[lx1] = coord.z;\n\
\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
bitonic_step_uint(num_stages, lx, local_data, local_indices);\n\
\n\
uint min_data = local_data[511];\n\
\n\
uint *p_share_k = local_data + threads_per_block;\n\
int *p_share_v = local_indices + threads_per_block;\n\
\n\
int limit = (width >> 10) << 10;\n\
p_share_k[lx] = init_k;\n\
p_share_v[lx] = init_v;\n\
\n\
p_share_k[lx1] = init_k;\n\
p_share_v[lx1] = init_v;\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\
{\n\
uint2 data;\n\
coord.z = coord.x + threads_per_block;\n\
data.x = read_imageui(input, coord.xy).x;\n\
data.y = read_imageui(input, coord.zy).x;\n\
\n\
p_share_k[lx] = data.x;\n\
p_share_v[lx] = coord.x;\n\
\n\
p_share_k[lx1] = data.y;\n\
p_share_v[lx1] = coord.z;\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\
\n\
if (p_share_k[index_minus_1] < min_data)\n\
{\n\
continue;\n\
}\n\
\n\
p_share_k[lx] = p_share_k[lx1];\n\
p_share_v[lx] = p_share_v[lx1];\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\
\n\
min_data = local_data[511];\n\
p_share_k[lx] = init_k;\n\
p_share_v[lx] = init_v;\n\
p_share_k[lx1] = init_k;\n\
p_share_v[lx1] = init_v;\n\
}\n\
\n\
if (width > limit)\n\
{\n\
if (coord.x < width)\n\
{\n\
uint2 data;\n\
data.x = read_imageui(input, coord.xy).x;\n\
coord.z = coord.x + threads_per_block;\n\
data.y = read_imageui(input, coord.zy).x;\n\
\n\
p_share_k[lx] = data.x;\n\
p_share_v[lx] = coord.x;\n\
\n\
p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\
p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\
}\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\
\n\
if (p_share_k[index_minus_1] >= min_data)\n\
{\n\
p_share_k[lx] = p_share_k[lx1];\n\
p_share_v[lx] = p_share_v[lx1];\n\
barrier(CLK_LOCAL_MEM_FENCE);\n\
bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\
}\n\
}\n\
\n\
uint4 dst;\n\
dst.x = local_data[lx];\n\
\n\
coord.x = lx;\n\
write_imageui(output, coord.xy, dst.xxxx);\n\
\n\
int4 index;\n\
index.x = local_indices[lx];\n\
\n\
write_imagei(indices, coord.xy, index.xxxx);\n\
}\n\
"; /* end of topk2_cl*/
static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X (32)\n\
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\
(\n\
@ -82484,7 +82114,6 @@ static const source_map_t cl_resource[] =
{"swish_cl", swish_cl},
{"tile_cl", tile_cl},
{"topk_cl", topk_cl},
{"topk2_cl", topk2_cl},
{"topk_odd_even_sort_cl", topk_odd_even_sort_cl},
{"topk_odd_even_sort2_cl", topk_odd_even_sort2_cl},
{"upsample_cl", upsample_cl},