Update internal to REL/v1.1.30.2

SHA: 2e64046f

Signed-off-by: Kainan Cha <kainan.zha@verisilicon.com>
Kainan Cha 2021-03-29 16:21:46 +08:00
parent b5f2666e92
commit c141416238
120 changed files with 14252 additions and 11997 deletions


@ -194,22 +194,13 @@ cc_library(
"src/kernel/vsi_nn_kernel_param.c",
"src/kernel/vsi_nn_gpu.c",
"src/kernel/vsi_nn_kernel_gpu_shape_optimize.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_crop.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_resize.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_scale.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_topk.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c",


@ -146,3 +146,4 @@ DEF_OP(SCATTER_ND)
DEF_OP(DECONVOLUTION1D)
DEF_OP(INTERP)
DEF_OP(RESIZE_1D)
DEF_OP(UPSAMPLESCALE)


@ -16,3 +16,4 @@ DEF_OP(GRUCELL_ACTIVATION_INTERNAL)
DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA)
DEF_OP(RESIZE_1D_BILINEAR_INTERNAL)
DEF_OP(RESIZE_1D_NEAREST_INTERNAL)
DEF_OP(SPACE2DEPTH_INTERNAL)
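
The two op tables above are plain DEF_OP lists: UPSAMPLESCALE and SPACE2DEPTH_INTERNAL become visible to the rest of the stack simply by appending one line each. A minimal, self-contained sketch of the X-macro pattern such lists are typically expanded with; the enum and name table below are illustrative, not the actual ovxlib expansion:

#include <stdio.h>

/* Illustrative op list in the same style as the DEF_OP tables above. */
#define OP_LIST \
    DEF_OP(RESIZE_1D) \
    DEF_OP(UPSAMPLESCALE) \
    DEF_OP(SPACE2DEPTH_INTERNAL)

/* Expand once into an enum ... */
#define DEF_OP(NAME) OP_##NAME,
enum { OP_LIST OP_COUNT };
#undef DEF_OP

/* ... and once into a printable name table. */
#define DEF_OP(NAME) #NAME,
static const char* op_names[] = { OP_LIST };
#undef DEF_OP

int main(void)
{
    int i;
    for (i = 0; i < OP_COUNT; i++)
    {
        printf("%d: %s\n", i, op_names[i]);
    }
    return 0;
}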


@ -38,6 +38,14 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape
int32_t* out_axis, uint32_t* out_axis_size
);
vsi_bool vsi_nn_kernel_optimize_tensor_shape
(
const int32_t* shape_x, const size_t rank_x,
const int32_t *axis, const size_t axis_size,
int32_t* out_shape_x, uint32_t* out_rank_x,
int32_t* out_axis, uint32_t* out_axis_size
);
vsi_bool vsi_nn_kernel_optimize_element_shape
(
const int32_t* shape_x, const size_t rank_x,
@ -59,4 +67,16 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
int32_t* out_shape_output, uint32_t* out_rank_output
);
vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape
(
const int32_t* shape, const uint32_t rank,
int32_t* out_shape, uint32_t* out_rank
);
vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape
(
const int32_t* shape, const uint32_t rank,
int32_t* out_shape, uint32_t* out_rank
);
#endif
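
The two new declarations suggest small shape-canonicalization helpers: padding a low-rank tensor up to a GPU-friendly rank, and folding an NCHW shape into a flattened spatial layout. A standalone sketch of what the 1-D padding variant could look like, purely as an assumption about its behavior, not the ovxlib implementation:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Illustrative only: pad a low-rank shape with trailing 1s so it is at least
 * 2-D; NOT the actual vsi_nn_kernel_optimize_1d_tensor_shape implementation. */
static bool pad_shape_to_2d(const int32_t* shape, uint32_t rank,
                            int32_t* out_shape, uint32_t* out_rank)
{
    uint32_t i;
    if (!shape || !out_shape || !out_rank || rank == 0)
    {
        return false;
    }
    for (i = 0; i < rank; i++)
    {
        out_shape[i] = shape[i];
    }
    for (i = rank; i < 2; i++)
    {
        out_shape[i] = 1;
    }
    *out_rank = rank < 2 ? 2 : rank;
    return true;
}

int main(void)
{
    int32_t in[1] = { 100 };
    int32_t out[2] = { 0 };
    uint32_t out_rank = 0;
    pad_shape_to_2d(in, 1, out, &out_rank);
    printf("rank=%u shape=[%d, %d]\n", out_rank, out[0], out[1]);  /* rank=2 [100, 1] */
    return 0;
}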


@ -372,10 +372,6 @@ enum vx_kernel_libnnext_offset_e
#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_Int16_copy"
#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8 VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8"
#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8_copy"
#define VX_KERNEL_NAME_LAYERNORM VIVANTE_NAMESPACE ".vxcLayerNorm"
#define VX_KERNEL_NAME_LAYERNORM_UINT8 VIVANTE_NAMESPACE ".vxcLayerNorm_u8"
#define VX_KERNEL_NAME_LAYERNORM_FP16TOU8 VIVANTE_NAMESPACE ".vxcLayerNormFP16toU8"
#define VX_KERNEL_NAME_LAYERNORM_U8TOFP16 VIVANTE_NAMESPACE ".vxcLayerNormU8toFp16"
#define VX_KERNEL_NAME_TENSORSTACKCONCAT VIVANTE_NAMESPACE ".vxcTensorStackConcat"
#define VX_KERNEL_NAME_TENSORSTACKCONCAT8BITS VIVANTE_NAMESPACE ".vxcTensorStackConcat8Bits"
#define VX_KERNEL_NAME_SIGNALFRAME_WIDTH VIVANTE_NAMESPACE ".vxcSignalFrame_width"


@ -70,6 +70,10 @@ typedef struct _vsi_nn_instancenorm_lcl_data2
uint32_t reshapeFlg;
uint32_t hash_idx;
vsi_bool execute_on_sw;
/* handle 3D instance norm */
vsi_nn_tensor_t *reshaped_input;
vsi_nn_tensor_t *reshaped_output;
} vsi_nn_instancenorm_lcl_data2;
typedef struct _vsi_nn_instancenorm_lcl_data


@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SPACE2DEPTH_INTERNAL_H
#define _VSI_NN_OP_SPACE2DEPTH_INTERNAL_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_space2depth_internal_param
{
int32_t block_size_x;
int32_t block_size_y;
} vsi_nn_space2depth_internal_param;
#ifdef __cplusplus
}
#endif
#endif
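
block_size_x and block_size_y let the internal space-to-depth op fold the two spatial dimensions into channels independently. A reference sketch of one common index mapping, given only to illustrate the parameters; the exact channel ordering used by the CL kernel may differ:

#include <stdio.h>
#include <stdint.h>

/* One common space-to-depth mapping (illustrative): element (x, y, c) of a
 * W x H x C input moves to
 * (x / bx, y / by, c + C * ((x % bx) + bx * (y % by))). */
static void space2depth_map_index(int32_t x, int32_t y, int32_t c,
                                  int32_t channels, int32_t bx, int32_t by,
                                  int32_t* ox, int32_t* oy, int32_t* oc)
{
    *ox = x / bx;
    *oy = y / by;
    *oc = c + channels * ((x % bx) + bx * (y % by));
}

int main(void)
{
    int32_t ox, oy, oc;
    space2depth_map_index(3, 1, 0, 4, 2, 1, &ox, &oy, &oc);  /* bx=2, by=1 */
    printf("(%d, %d, %d)\n", ox, oy, oc);                    /* (1, 1, 4)   */
    return 0;
}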


@ -0,0 +1,39 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_UPSAMPLESCALE_H
#define _VSI_NN_OP_UPSAMPLESCALE_H
#include "vsi_nn_types.h"
typedef struct _vsi_nn_upsamplescale_param
{
struct _upsamplescale_local_data_t* local;
// Add parameters here
int32_t stride;
float scale;
} vsi_nn_upsamplescale_param;
#endif
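
The parameter struct implies nearest-neighbour upsampling by stride followed by multiplying every element by scale. A hedged CPU reference sketch for a single 2-D plane; illustrative only, the actual kernel semantics may differ:

#include <stdio.h>
#include <stdint.h>

/* Replicate each input pixel into a stride x stride block, then multiply by
 * scale; out must hold (w*stride) x (h*stride) floats. */
static void upsamplescale_plane(const float* in, int32_t w, int32_t h,
                                int32_t stride, float scale, float* out)
{
    int32_t x, y;
    int32_t ow = w * stride;
    int32_t oh = h * stride;
    for (y = 0; y < oh; y++)
    {
        for (x = 0; x < ow; x++)
        {
            out[y * ow + x] = in[(y / stride) * w + (x / stride)] * scale;
        }
    }
}

int main(void)
{
    const float in[1] = { 2.0f };
    float out[4];
    upsamplescale_plane(in, 1, 1, 2, 0.5f, out);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 1 1 1 */
    return 0;
}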


@ -677,6 +677,11 @@ OVXLIB_API vsi_status vsi_nn_TrySetupCompleteSignalNode
vsi_nn_graph_t* graph
);
vsi_status vsi_nn_setup_binary_graph_inputs_outputs
(
vsi_nn_graph_t* graph
);
void vsi_nn_get_tensor_consumers
(
vsi_nn_graph_t* graph,


@ -56,6 +56,7 @@
#include "ops/vsi_nn_op_elu.h"
#include "ops/vsi_nn_op_reverse.h"
#include "ops/vsi_nn_op_space2depth.h"
#include "ops/vsi_nn_op_space2depth_internal.h"
#include "ops/vsi_nn_op_depth2space.h"
#include "ops/vsi_nn_op_depth2space_internal.h"
#include "ops/vsi_nn_op_maximum.h"
@ -162,6 +163,7 @@
#include "ops/vsi_nn_op_resize_1d.h"
#include "ops/vsi_nn_op_resize_1d_bilinear_internal.h"
#include "ops/vsi_nn_op_resize_1d_nearest_internal.h"
#include "ops/vsi_nn_op_upsamplescale.h"
/* custom node head define */
#include "custom/vsi_nn_custom_node_type.h"
@ -204,6 +206,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_elu_param elu;
vsi_nn_reverse_param reverse;
vsi_nn_space2depth_param space2depth;
vsi_nn_space2depth_internal_param space2depth_internal;
vsi_nn_depth2space_param depth2space;
vsi_nn_depth2space_internal_param depth2space_internal;
vsi_nn_maximum_param maximum;
@ -310,6 +313,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_resize_1d_param resize_1d;
vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal;
vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal;
vsi_nn_upsamplescale_param upsamplescale;
uint8_t client_param[128];
/* custom node data struct define */
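
The union gains one member per new op, so parameters are reached as node->nn_param.space2depth_internal or node->nn_param.upsamplescale. A self-contained sketch of the underlying pattern, one overlaid parameter struct per op selected by the node's op type; the names below are illustrative, not the real vsi_nn_nn_param layout:

#include <stdio.h>
#include <stdint.h>

typedef struct { int32_t block_size_x; int32_t block_size_y; } space2depth_internal_param;
typedef struct { int32_t stride; float scale; } upsamplescale_param;

typedef union
{
    space2depth_internal_param space2depth_internal;
    upsamplescale_param        upsamplescale;
    uint8_t                    client_param[128]; /* keeps the union size stable */
} nn_param_sketch;

int main(void)
{
    nn_param_sketch p;
    p.upsamplescale.stride = 2;
    p.upsamplescale.scale  = 0.5f;
    printf("stride=%d scale=%g\n", (int)p.upsamplescale.stride, p.upsamplescale.scale);
    return 0;
}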


@ -65,6 +65,12 @@ typedef enum
VSI_NN_SOURCE_LAYOUT_NCHW,
} vsi_nn_preprocess_source_layout_e;
typedef enum
{
VSI_NN_DEST_LAYOUT_NHWC = 0,
VSI_NN_DEST_LAYOUT_NCHW,
} vsi_nn_preprocess_dest_layout_e;
/**
* Input source format
*/


@ -214,7 +214,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
if(rsFlg)
if (rsFlg)
{
height = height / chn;
}
@ -281,7 +281,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
if(rsFlg)
if (rsFlg)
{
height = height / chn;
}
@ -355,12 +355,12 @@ static vsi_status _query_kernel
for( i = 0; i < kernel_map_size; i ++ )
{
if( kernel_map[i].key == hashkey )
if ( kernel_map[i].key == hashkey )
{
break;
}
}
if( i < kernel_map_size )
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@ -413,19 +413,23 @@ static vsi_nn_kernel_node_t _setup
int32_t width = inputs[0]->attr.size[0];
int32_t height = inputs[0]->attr.size[1];
int32_t group_num = (width + 15) / 16;
int32_t input_zp = inputs[0]->attr.dtype.zero_point;
float input_scale = inputs[0]->attr.dtype.scale;
int32_t input_fl = inputs[0]->attr.dtype.fl;
int32_t output_zp = outputs[0]->attr.dtype.zero_point;
float output_scale = outputs[0]->attr.dtype.scale;
int32_t output_fl = outputs[0]->attr.dtype.fl;
int32_t input_zp = 0;
float input_scale = 1.0f;
int32_t input_fl = 0;
int32_t output_zp = 0;
float output_scale = 1.0f;
int32_t output_fl = 0;
float in_fl_scale = 1.0f, out_fl_scale = 1.0;
float dim_ratio = (float)1.0 / (float)(width * height);
if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
input_zp = inputs[0]->attr.dtype.zero_point;
input_scale = inputs[0]->attr.dtype.scale;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
input_fl = inputs[0]->attr.dtype.fl;
if (input_fl > 0)
{
in_fl_scale = (1.0f / ((float) ((int64_t)1 << input_fl)));
@ -434,12 +438,17 @@ static vsi_nn_kernel_node_t _setup
{
in_fl_scale = ((float) ((int64_t)1 << -input_fl));
}
input_zp = 0;
}
if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8
|| outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
output_zp = outputs[0]->attr.dtype.zero_point;
output_scale = 1.0f / outputs[0]->attr.dtype.scale;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
output_fl = outputs[0]->attr.dtype.fl;
if (output_fl > 0)
{
out_fl_scale = (float)((int64_t)1 << output_fl);
@ -448,9 +457,10 @@ static vsi_nn_kernel_node_t _setup
{
out_fl_scale = (1.0f / (float)((int64_t)1 << -output_fl));
}
output_zp = 0;
}
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@ -482,17 +492,17 @@ static vsi_nn_kernel_node_t _setup
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg );
status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
if( VSI_SUCCESS != status )
if ( VSI_SUCCESS != status )
{
goto final;
}
status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM );
if( VSI_SUCCESS != status )
if ( VSI_SUCCESS != status )
{
goto final;
}
if(reshape_flg)
if (reshape_flg)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[0]->attr.size[0];
@ -507,7 +517,7 @@ static vsi_nn_kernel_node_t _setup
shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
}
if(inputs[1]->attr.dim_num < 2)
if (inputs[1]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[1]->attr.size[0];
@ -516,7 +526,7 @@ static vsi_nn_kernel_node_t _setup
shape[3] = 1;
rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 );
}
if(inputs[2]->attr.dim_num < 2)
if (inputs[2]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[2]->attr.size[0];
@ -528,10 +538,10 @@ static vsi_nn_kernel_node_t _setup
// Mean Vari
{
node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
if(node)
if (node)
{
uint32_t index = 0;
if(reshape_flg)
if (reshape_flg)
{
mean_vari_node_params[index++] = rs_input;
}
@ -565,10 +575,10 @@ static vsi_nn_kernel_node_t _setup
// Normalization
{
node = vsi_nn_kernel_create_node( graph, kernel );
if(node)
if (node)
{
uint32_t index = 0;
if(reshape_flg)
if (reshape_flg)
{
node_params[index++] = rs_input;
}
@ -576,7 +586,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
}
if(inputs[1]->attr.dim_num < 2)
if (inputs[1]->attr.dim_num < 2)
{
node_params[index++] = rs_beta;
}
@ -584,7 +594,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
}
if(inputs[2]->attr.dim_num < 2)
if (inputs[2]->attr.dim_num < 2)
{
node_params[index++] = rs_gamma;
}
@ -593,7 +603,7 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
if(reshape_flg)
if (reshape_flg)
{
node_params[index++] = rs_output;
}
@ -634,26 +644,26 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
final:
if(rs_beta)
if (rs_beta)
{
vsi_nn_kernel_tensor_release( &rs_beta );
}
if(rs_gamma)
if (rs_gamma)
{
vsi_nn_kernel_tensor_release( &rs_gamma );
}
if(reshape_flg)
if (reshape_flg)
{
vsi_nn_kernel_tensor_release( &rs_input );
vsi_nn_kernel_tensor_release( &rs_output );
}
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
{
if( ikernels[i] )
if ( ikernels[i] )
{
vsi_nn_kernel_release( &ikernels[i] );
}
if( tensors[i] )
if ( tensors[i] )
{
vsi_nn_ReleaseTensor( &tensors[i] );
}
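
The setup code above now selects quantization parameters by qnt_type instead of by element type: affine tensors contribute zero_point/scale, while DFP tensors derive the scale from the fractional length fl and force the zero point to 0. A self-contained sketch of that fl-to-scale conversion, mirroring the shift expressions in the diff; the helper name is illustrative:

#include <stdio.h>
#include <stdint.h>

/* scale = 2^-fl, written with shifts exactly as in the kernel setup above. */
static float dfp_fl_to_scale(int32_t fl)
{
    if (fl > 0)
    {
        return 1.0f / (float)((int64_t)1 << fl);   /* fl =  7 -> 1/128 */
    }
    return (float)((int64_t)1 << -fl);             /* fl = -2 -> 4.0   */
}

int main(void)
{
    printf("%g %g\n", dfp_fl_to_scale(7), dfp_fl_to_scale(-2));  /* 0.0078125 4 */
    return 0;
}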


@ -0,0 +1,395 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "layer_normalization"
#define HASH_LAYERNORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.layer_norm_"#SRC0_TYPE"to"#DST_TYPE)
// Add kernel hashtable here
#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_LAYERNORM_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _layernorm_kernel_map[] =
{
// Register kernel here
TENSOR_LAYERNORM_KERNELS( F32, F32, KERNEL_SOURCE_1 )
TENSOR_LAYERNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 )
};
/*
* Kernel params
*/
static vx_param_description_t _layernorm_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _LAYERNORM_PARAM_NUM _cnt_of_array( _layernorm_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_layernorm_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_int_array_t * input_shape = NULL;
//int32_t width = 0;
int32_t height = 0;
int32_t chn = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
input_shape = attr[0]->shape;
//width = input_shape->data[0];
height = input_shape->data[1];
chn = (input_shape->size <= 2) ? 1 : input_shape->data[2];
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.local_size[0] = 16;
gpu_param.local_size[1] = 1;
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = height;
gpu_param.global_size[2] = chn;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _layernorm_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t* kernel,
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
int32_t reshape2D
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input0_dtype == F16 && output_dtype == F16)
{
input0_dtype = F32;
output_dtype = F32;
}
key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, 0 );
for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ )
{
if ( _layernorm_kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(_layernorm_kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_kernel_map[i].function_name );
kernel->info.parameters = _layernorm_kernel_param_def;
kernel->info.numParams = _LAYERNORM_PARAM_NUM;
kernel->info.initialize = _layernorm_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
_layernorm_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_layernorm_kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_gamma = NULL, rs_beta = NULL;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t width = inputs[0]->attr.size[0];
int32_t height = inputs[0]->attr.size[1];
int32_t input_fl = 0;
float input_zp = 0.0f;
float input_scale = 1.0f;
int32_t output_fl = 0;
float output_zp = 0.0f;
float output_scale = 1.0f;
float e2InScale = 1.0f, scale_inOut = 1.0f;
float dim_ratio = (float)1.0 / (float)(width);
float sumZpScale = 0.0f;
float zp2ScaleE2 = 0.0f;
float sumZpScaleE2 = 0.0f;
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
input_zp = (float)inputs[0]->attr.dtype.zero_point;
input_scale = inputs[0]->attr.dtype.scale;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
input_fl = inputs[0]->attr.dtype.fl;
if (input_fl > 0)
{
input_scale = (1.0f / ((float) ((int64_t)1 << input_fl)));
}
else
{
input_scale = ((float) ((int64_t)1 << -input_fl));
}
input_zp = 0.0f;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
output_zp = (float)outputs[0]->attr.dtype.zero_point;
output_scale = 1.0f / outputs[0]->attr.dtype.scale;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
output_fl = outputs[0]->attr.dtype.fl;
if (output_fl > 0)
{
output_scale = (float)((int64_t)1 << output_fl);
}
else
{
output_scale = (1.0f / (float)((int64_t)1 << -output_fl));
}
output_zp = 0.0f;
}
scale_inOut = input_scale * output_scale;
e2InScale = input_scale * input_scale;
sumZpScale = width * input_zp * input_scale;
zp2ScaleE2 = input_zp * 2 * e2InScale;
sumZpScaleE2 = width * input_zp * input_zp * e2InScale;
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, 0 );
if ( VSI_SUCCESS != status )
{
goto final;
}
if (inputs[1]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[1]->attr.size[0];
shape[1] = 1;
shape[2] = 1;
shape[3] = 1;
rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 );
}
if (inputs[2]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[2]->attr.size[0];
shape[1] = 1;
shape[2] = 1;
shape[3] = 1;
rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 );
}
// Normalization
{
node = vsi_nn_kernel_create_node( graph, kernel );
if (node)
{
uint32_t index = 0;
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
if (inputs[1]->attr.dim_num < 2)
{
node_params[index++] = rs_beta;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
}
if (inputs[2]->attr.dim_num < 2)
{
node_params[index++] = rs_gamma;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t;
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &e2InScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_inOut );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &sumZpScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp2ScaleE2 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &sumZpScaleE2 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio );
status = vsi_nn_kernel_node_pass_param( node, node_params,
_LAYERNORM_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
vsi_nn_kernel_scalar_release( &node_params[16] );
}
}
/* Pass parameters to node. */
final:
if (rs_beta)
{
vsi_nn_kernel_tensor_release( &rs_beta );
}
if (rs_gamma)
{
vsi_nn_kernel_tensor_release( &rs_gamma );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( layer_norm, _setup )
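
The constants e2InScale, sumZpScale, zp2ScaleE2 and sumZpScaleE2 computed in _setup let the shader accumulate raw quantized values and still recover the dequantized mean and variance, assuming the affine model x_f = (q - zp) * scale. A small numeric check of that algebra:

#include <stdio.h>

/* With x_f = (q - zp) * scale:
 *   sum(x_f)   = scale * sum(q) - width*zp*scale                 (sumZpScale)
 *   sum(x_f^2) = scale^2 * sum(q^2)                               (e2InScale)
 *                - 2*zp*scale^2 * sum(q)                          (zp2ScaleE2)
 *                + width*zp^2*scale^2                             (sumZpScaleE2) */
int main(void)
{
    const int   width = 4;
    const int   q[4] = { 3, 120, 200, 255 };
    const float scale = 0.05f;
    const float zp = 128.0f;
    float sum_q = 0.0f, sum_q2 = 0.0f, sum_f = 0.0f, sum_f2 = 0.0f;
    int i;
    for (i = 0; i < width; i++)
    {
        float f = ((float)q[i] - zp) * scale;
        sum_q  += (float)q[i];
        sum_q2 += (float)q[i] * (float)q[i];
        sum_f  += f;
        sum_f2 += f * f;
    }
    {
        float e2InScale    = scale * scale;
        float sumZpScale   = width * zp * scale;
        float zp2ScaleE2   = 2.0f * zp * e2InScale;
        float sumZpScaleE2 = width * zp * zp * e2InScale;
        printf("sum   direct=%f reconstructed=%f\n", sum_f,  scale * sum_q - sumZpScale);
        printf("sumsq direct=%f reconstructed=%f\n", sum_f2, e2InScale * sum_q2 - zp2ScaleE2 * sum_q + sumZpScaleE2);
    }
    return 0;
}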


@ -59,6 +59,9 @@ __BEGIN_DECLS
#define HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
#define HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_transb_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
#define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{ HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \
HASH_MATRIXMUL_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \
@ -69,6 +72,11 @@ __BEGIN_DECLS
HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
{ HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2), \
HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -83,6 +91,10 @@ static const struct {
TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1)
};
/*
@ -98,6 +110,12 @@ static vx_param_description_t _matrixmul_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _MATRIXMUL_PARAM_NUM _cnt_of_array(_matrixmul_kernel_param_def)
@ -130,7 +148,7 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer)
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
width = attr[0]->shape->data[0];
height = attr[0]->shape->data[0];
height = attr[0]->shape->data[1];
chn = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1;
gpu_param.global_scale[0] = 1;
@ -175,22 +193,27 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(depth > 1)
if (depth > 1)
{
dim_type = _3D;
}
if (input1_dtype == I16 || input1_dtype == I32)
{
input1_dtype = I8;
}
key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa );
for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ )
{
if( matrixmul_map[i].key == key )
if ( matrixmul_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(matrixmul_map) )
if ( i < _cnt_of_array(matrixmul_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrixmul_map[i].function_name );
kernel->info.parameters = _matrixmul_kernel_param_def;
@ -223,48 +246,111 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" );
int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" );
int32_t transFlg = 0;
uint32_t M = inputs[0]->attr.size[1];
uint32_t K = inputs[0]->attr.size[0];
uint32_t N = inputs[1]->attr.size[0];
uint32_t depth = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1;
uint32_t ac2zero = 0;
uint32_t bc2zero = 0;
float scale_a = 1.0f;
float zp_a = 0;
float scale_b = 1.0f;
float zp_b = 0;
float scale_out = 1.0f;
float zp_out = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
if(transposeB)
if (transposeB)
{
return NULL;
N = inputs[1]->attr.size[1];
transFlg = 2;
}
if(transposeA)
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
if (inputs[0]->attr.dtype.fl > 0)
{
scale_a = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl)));
}
else
{
scale_a = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl));
}
zp_a = 0;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
zp_a = (float)inputs[0]->attr.dtype.zero_point;
scale_a = inputs[0]->attr.dtype.scale;
}
if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
if (inputs[1]->attr.dtype.fl > 0)
{
scale_b = (1.0f / ((float) ((int64_t)1 << inputs[1]->attr.dtype.fl)));
}
else
{
scale_b = ((float) ((int64_t)1 << -inputs[1]->attr.dtype.fl));
}
zp_b = 0;
}
else if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
zp_b = (float)inputs[1]->attr.dtype.zero_point;
scale_b = inputs[1]->attr.dtype.scale;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
if (outputs[0]->attr.dtype.fl > 0)
{
scale_out = (float)((int64_t)1 << outputs[0]->attr.dtype.fl);
}
else
{
scale_out = (1.0f / (float)((int64_t)1 << -outputs[0]->attr.dtype.fl));
}
zp_out = 0;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
zp_out = (float)outputs[0]->attr.dtype.zero_point;
scale_out = outputs[0]->attr.dtype.scale;
}
if (transposeA)
{
K = inputs[0]->attr.size[1];
M = inputs[0]->attr.size[0];
transFlg = 1;
}
if((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) ||
if ((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) ||
(inputs[0]->attr.size[2] > inputs[1]->attr.size[2]
&& inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2))
{
bc2zero = 1;
}
else if((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) ||
else if ((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) ||
(inputs[1]->attr.size[2] > inputs[0]->attr.size[2]
&& inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2))
{
ac2zero = 1;
}
status = _query_kernel( kernel, inputs, outputs, depth, transposeA );
if( VSI_SUCCESS == status)
status = _query_kernel( kernel, inputs, outputs, depth, transFlg );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 3;
/* Pass parameters to node. */
@ -275,6 +361,12 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ac2zero );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bc2zero );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_a );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_a );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_out );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_out );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MATRIXMUL_PARAM_NUM );
CHECK_STATUS(status);
@ -283,6 +375,12 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
}
}
return node;
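
With the new transb path, transFlg = 2 selects kernels that read B in transposed form and take N from inputs[1].size[1]. A plain CPU reference of the transposed-B GEMM those kernels are expected to implement; illustrative, not the CL code:

#include <stdio.h>

/* A: M x K row-major, B stored as N x K (i.e. already transposed), C: M x N. */
static void gemm_transb_ref(const float* a, const float* b_t, float* c,
                            int M, int K, int N)
{
    int m, n, k;
    for (m = 0; m < M; m++)
    {
        for (n = 0; n < N; n++)
        {
            float sum = 0.0f;
            for (k = 0; k < K; k++)
            {
                sum += a[m * K + k] * b_t[n * K + k];
            }
            c[m * N + n] = sum;
        }
    }
}

int main(void)
{
    const float a[4]   = { 1, 2, 3, 4 };   /* 2 x 2 */
    const float b_t[4] = { 1, 0, 0, 1 };   /* identity, stored as B^T */
    float c[4];
    gemm_transb_ref(a, b_t, c, 2, 2, 2);
    printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);  /* 1 2 3 4 */
    return 0;
}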


@ -0,0 +1,329 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
#define _ROI_ALIGN_KERNEL_SOURCE(_input_type) "roi_align"
#define STR(a) #a
// Add kernel hashtable here
#define ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, _image_2d ) \
(( IN0_DTYPE ) | ( IN1_DTYPE << 7) | (IN2_DTYPE << 14) | (OUT_DTYPE << 21) | (_image_2d << 28))
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE ) \
{ ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, 0 ), \
CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"to"STR(OUT_DTYPE)), \
_ROI_ALIGN_KERNEL_SOURCE(IN0_DTYPE) }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _roi_align_kernel_map[] =
{
PACK_KERNEL_MAP(F32, F32, I32, F32),
};
/*
* Kernel params
*/
static vx_param_description_t _roi_align_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
#define SCALAR_SPATIAL_X_SCALE (4)
#define SCALAR_SPATIAL_Y_SCALE (5)
#define SCALAR_INPUT_WIDTH (6)
#define SCALAR_INPUT_HEIGHT (7)
#define SCALAR_RCP_OF_OUTPUT_WIDTH (8)
#define SCALAR_RCP_OF_OUTPUT_HEIGHT (9)
#define SCALAR_SAMPLING_X_RATIO (10)
#define SCALAR_SAMPLING_Y_RATIO (11)
#define SCALAR_DEPTH (12)
#define ROI_ALIGN_PARAM_NUM 13
#define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_roi_align_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * rois_attr = NULL;
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_int_array_t * rois_shape = NULL;
vsi_int_array_t * out_shape = NULL;
rois_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( rois_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
rois_shape = rois_attr->shape;
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = 3;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = rois_shape->data[1];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(rois_attr);
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _roi_align_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e in2_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _roi_align_kernel_map;
size_t kernel_map_size = _cnt_of_array( _roi_align_kernel_map );
vx_param_description_t * param_def = _roi_align_kernel_param_def;
size_t param_def_size = ROI_ALIGN_QUANT_PARAM_NUM;
vx_kernel_initialize_f initializer = _roi_align_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
in0_dtype = in0_dtype == F16 ? F32 : in0_dtype;
in1_dtype = in1_dtype == F16 ? F32 : in1_dtype;
key = ROI_ALIGN_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROI_ALIGN_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
uint32_t rank[_IO_NUM] = {0};
int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL };
int32_t i = 0;
float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" );
float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" );
int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" );
int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" );
float width_scale = 1.0f / width_ratio;
float height_scale = 1.0f / height_ratio;
float in_width = (float)(inputs[0]->attr.size[0]);
float in_height = (float)(inputs[0]->attr.size[1]);
float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]);
float rcp_of_out_height = 1.0f / (float)(outputs[0]->attr.size[1]);
float sampling_x_ratio = width_sample_num > 0 ? (float)width_sample_num : 0;
float sampling_y_ratio = height_sample_num > 0 ? (float)height_sample_num : 0;
int depth = inputs[0]->attr.size[2];
vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num,
shapes[0], &rank[0]);
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num,
shapes[1], &rank[1]);
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[2]->attr.size, inputs[2]->attr.dim_num,
shapes[2], &rank[2]);
vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[3], &rank[3]);
for (i = 0; i < _INPUT_NUM; i++)
{
reshape_tensors[i] = vsi_nn_reshape_tensor( graph,
inputs[i], (uint32_t*)shapes[i], rank[i] );
}
reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] );
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
inputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[_INPUT_NUM], image_2d);
if ( VSI_SUCCESS == status )
{
size_t node_params_num = ROI_ALIGN_PARAM_NUM;
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num );
node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale );
node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width );
node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &in_height );
node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_width );
node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_height );
node_params[SCALAR_SAMPLING_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_x_ratio );
node_params[SCALAR_SAMPLING_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_y_ratio );
node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_X_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_Y_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_HEIGHT] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_X_RATIO] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_Y_RATIO] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] );
}
}
for (i = 0; i < _IO_NUM; i++)
{
if (reshape_tensors[i])
{
vsi_nn_ReleaseTensor( &reshape_tensors[i] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( roi_align, _setup )
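
The scalars passed to the kernel (spatial X/Y scales, input size, reciprocal output size, sampling ratios, depth) are the usual ROI Align bin geometry. A hedged sketch of how one output cell's sampling grid follows from them, NNAPI-style; the function and field names are illustrative:

#include <stdio.h>
#include <math.h>

typedef struct
{
    float bin_w, bin_h;   /* input units covered by one output cell */
    int   grid_x, grid_y; /* bilinear samples taken per cell        */
} roi_bin_geom;

/* Scale the ROI into feature-map coordinates, divide it into output_w x
 * output_h cells, and pick the sampling grid per cell (adaptive when the
 * sampling ratio is 0). */
static roi_bin_geom roi_align_bin_geometry(float x1, float y1, float x2, float y2,
                                           float width_scale, float height_scale,
                                           float rcp_out_w, float rcp_out_h,
                                           float sampling_x_ratio, float sampling_y_ratio)
{
    roi_bin_geom g;
    float roi_w = (x2 - x1) * width_scale;
    float roi_h = (y2 - y1) * height_scale;
    g.bin_w  = roi_w * rcp_out_w;
    g.bin_h  = roi_h * rcp_out_h;
    g.grid_x = sampling_x_ratio > 0.0f ? (int)sampling_x_ratio : (int)ceilf(g.bin_w);
    g.grid_y = sampling_y_ratio > 0.0f ? (int)sampling_y_ratio : (int)ceilf(g.bin_h);
    return g;
}

int main(void)
{
    /* 32x32 ROI on a 1/4-resolution feature map, 8x8 output, adaptive sampling. */
    roi_bin_geom g = roi_align_bin_geometry(0, 0, 32, 32, 0.25f, 0.25f,
                                            1.0f / 8.0f, 1.0f / 8.0f, 0.0f, 0.0f);
    printf("bin=%gx%g grid=%dx%d\n", g.bin_w, g.bin_h, g.grid_x, g.grid_y);
    return 0;
}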


@ -0,0 +1,298 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "space2depth_internal"
#define HASH_SPACE2DEPTH_INTERNAL_KEY(_input0_type, _output_type, _opt_flg) \
((_input0_type << 24) | (_output_type << 16) | (_opt_flg << 8))
#define HASH_SPACE2DEPTH_INTERNAL_CL_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.space2depth_internal_"#SRC0_TYPE"to"#DST_TYPE)
#define HASH_SPACE2DEPTH_INTERNAL_X2Y1_CL_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.space2depth_internal_"#SRC0_TYPE"to"#DST_TYPE"_X2Y1")
#define TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_SPACE2DEPTH_INTERNAL_CL_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 1), \
HASH_SPACE2DEPTH_INTERNAL_X2Y1_CL_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} kernel_map[] =
{
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(F32, F32, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(U8, U8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(F32, F32, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(U8, U8, KERNEL_SOURCE_1)
};
/*
* Kernel params
*/
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_int_array_t * in_shape = NULL;
int32_t width = 0;
int32_t height = 0;
int32_t chn = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
in_shape = attr[0]->shape;
width = in_shape->data[0];
height = in_shape->data[1];
chn = in_shape->size > 2 ? in_shape->data[2] : 1;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = height;
gpu_param.global_size[2] = chn;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _space2depth_internal_initializer() */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
int32_t opt_flg
)
{
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
/* Remap F16 to the F32 kernels before building the hash key, otherwise the
   F16/F16 case never matches an entry in kernel_map. */
if (input0_dtype == F16 && output_dtype == F16)
{
input0_dtype = F32;
output_dtype = F32;
}
key = HASH_SPACE2DEPTH_INTERNAL_KEY( input0_dtype, output_dtype, opt_flg );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
kernel->info.initialize = _space2depth_internal_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" );
int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" );
int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0;
float inputScale = inputs[0]->attr.dtype.scale;
int32_t inputZp = inputs[0]->attr.dtype.zero_point;
float outputScale = outputs[0]->attr.dtype.scale;
int32_t outputZp = outputs[0]->attr.dtype.zero_point;
float scaleInOut = 1.0f;
float zpInOut = 0.0f;
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
int32_t input_fl = inputs[0]->attr.dtype.fl;
if (input_fl > 0)
{
inputScale = (1.0f / ((float) ((int64_t)1 << input_fl)));
}
else
{
inputScale = ((float) ((int64_t)1 << -input_fl));
}
inputZp = 0;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE)
{
inputScale = 1.0f;
inputZp = 0;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
int32_t output_fl = outputs[0]->attr.dtype.fl;
if (output_fl > 0)
{
outputScale = (1.0f / ((float) ((int64_t)1 << output_fl)));
}
else
{
outputScale = ((float) ((int64_t)1 << -output_fl));
}
outputZp = 0;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE)
{
outputScale = 1.0f;
outputZp = 0;
}
scaleInOut = inputScale / outputScale;
zpInOut = outputZp - inputZp * scaleInOut;
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, opt_flg);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
int32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 1, outputs, 1 );
node_params[index++] = vsi_nn_kernel_scalar_create(
graph, I32, &block_size_x );
node_params[index++] = vsi_nn_kernel_scalar_create(
graph, I32, &block_size_y );
node_params[index++] = vsi_nn_kernel_scalar_create(
graph, F32, &scaleInOut );
node_params[index] = vsi_nn_kernel_scalar_create(
graph, F32, &zpInOut );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( space2depth_internal, _setup )
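
scaleInOut and zpInOut collapse the input and output affine quantizations into a single fused requantization, out_q = in_q * scaleInOut + zpInOut, matching the expressions in _setup above. A quick numeric check:

#include <stdio.h>

int main(void)
{
    float inputScale = 0.5f,  outputScale = 0.25f;
    float inputZp    = 3.0f,  outputZp    = 10.0f;
    float scaleInOut = inputScale / outputScale;          /* as in the kernel setup */
    float zpInOut    = outputZp - inputZp * scaleInOut;
    float in_q       = 7.0f;
    float via_float  = (in_q - inputZp) * inputScale / outputScale + outputZp;
    float fused      = in_q * scaleInOut + zpInOut;
    printf("two-step=%f fused=%f\n", via_float, fused);   /* both print 18.0 */
    return 0;
}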


@ -173,7 +173,7 @@ final:
if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _pre_process_yuv420_exec() */
} /* _instance_norm_exec() */
/*
* Kernel params
*/


@ -0,0 +1,255 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (3)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.layer_norm")
DEF_KERNEL_EXECUTOR(_layer_norm_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
float eps = .0f;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
tensors[3] = (vsi_nn_kernel_tensor_t)param[3];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final );
buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[2], "Create input2 buffer fail.", final );
buffer[3] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final );
memset( buffer[3], 0, out_elements * sizeof(float) );
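    /* Layer normalization over axis 0: for every (outer, inner) slice compute the
     * mean and variance of the axisSize elements, then write
     * out = (x - mean) / sqrt(var + eps) * gamma + beta. */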
{
uint32_t axis_first = 0;
uint32_t axis_num = 1;
uint32_t outerSize = 1;
uint32_t axisSize = 1;
uint32_t innerSize = 1;
uint32_t inner = 0;
uint32_t outer = 0;
for (i = 0; i < (uint32_t)axis_first; i++)
{
innerSize *= attr[0]->shape->data[i];
}
for(i = 0; i < (uint32_t)axis_num; i++)
{
axisSize *= attr[0]->shape->data[axis_first + i];
}
for (i = (uint32_t)axis_first + axis_num; i < attr[0]->shape->size; i++)
{
outerSize *= attr[0]->shape->data[i];
}
for ( outer = 0; outer < outerSize; ++outer)
{
for ( inner = 0; inner < innerSize; ++inner)
{
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
for (i = 0; i < (uint32_t)axisSize; ++i)
{
float value = buffer[0][(outer * axisSize + i) * innerSize + inner];
sum += value;
sumsq += (value * value);
}
mean = sum / (axisSize);
vari = sumsq / (axisSize) - mean * mean;
vari = (float)(1.0 / sqrtf(vari + eps));
for (i = 0; i < (uint32_t)axisSize; ++i)
{
int idx = (outer * axisSize + i) * innerSize + inner;
float data = buffer[0][idx] - mean;
                    float scaleVal = buffer[2][i];
                    float biasVal = buffer[1][i];
float normVal = data * vari * scaleVal + biasVal;
buffer[3][idx] = normVal;
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3],
buffer[3], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
}
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _layer_norm_exec() */
/*
* Kernel params
*/
static vx_param_description_t _layer_normalization_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _LAYER_NORMALIZATION_PARAM_NUM _cnt_of_array( _layer_normalization_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_layer_norm_exec,
_layer_normalization_kernel_param_def,
_LAYER_NORMALIZATION_PARAM_NUM,
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( inputs, outputs, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[4] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( layer_norm, _setup )

View File

@ -0,0 +1,378 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.roi_align")
/*
* Kernel params
*/
static vx_param_description_t _roi_align_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
#define SCALAR_X_RATIO (4)
#define SCALAR_Y_RATIO (5)
#define SCALAR_X_SAMPLE (6)
#define SCALAR_Y_SAMPLE (7)
/*
* Kernel function
*/
static float _compute_region_coordinate(int32_t p, float bin_size, float roi_anchor, float max_value)
{
const float region_start = p * bin_size + roi_anchor;
return vsi_nn_clamp(region_start, 0.0f, max_value - 1);
}
static float _roi_align_1x1(float *input_ptr,
int32_t width,
int32_t height,
float region_start_x,
float bin_size_x,
int32_t grid_size_x,
float region_end_x,
float region_start_y,
float bin_size_y,
int32_t grid_size_y,
float region_end_y)
{
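    /* Average one output bin: sample a grid_size_y x grid_size_x grid of points
     * inside the bin, bilinearly interpolate each sample from the input plane and
     * return the mean of the samples. */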
if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
return 0;
}
else
{
float avg = 0;
int32_t iy = 0;
int32_t ix = 0;
// Iterate through the aligned pooling region
for (iy = 0; iy < grid_size_y; ++iy)
{
for (ix = 0; ix < grid_size_x; ++ix)
{
// Align the window in the middle of every bin
float y = region_start_y + ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y);
float x = region_start_x + ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x);
// Interpolation in the [0,0] [0,1] [1,0] [1,1] square
const int32_t y_low = (int32_t)y;
const int32_t x_low = (int32_t)x;
const int32_t y_high = vsi_nn_min(y_low + 1, height - 1);
const int32_t x_high = vsi_nn_min(x_low + 1, width - 1);
const float ly = y - y_low;
const float lx = x - x_low;
const float hy = 1.0f - ly;
const float hx = 1.0f - lx;
const float w1 = hy * hx;
const float w2 = hy * lx;
const float w3 = ly * hx;
const float w4 = ly * lx;
const float data1 = *(input_ptr + y_low * width + x_low);
const float data2 = *(input_ptr + y_low * width + x_high);
const float data3 = *(input_ptr + y_high * width + x_low);
const float data4 = *(input_ptr + y_high * width + x_high);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
avg /= grid_size_x * grid_size_y;
return avg;
}
}
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i = 0;
float width_scale = 0.0f;
float height_scale = 0.0f;
float width_ratio = 0.0f;
float height_ratio = 0.0f;
int32_t width_sample_num = 0;
int32_t height_sample_num = 0;
uint32_t n = 0;
uint32_t num_rois = 0;
int32_t inHeight = 0;
int32_t inWidth = 0;
int32_t inDepth = 0;
int32_t outHeight = 0;
int32_t outWidth = 0;
uint32_t kRoiDim = 4;
uint32_t out_index = 0;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_X_RATIO], &(width_ratio));
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_RATIO], &(height_ratio));
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_X_SAMPLE], &(width_sample_num));
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_SAMPLE], &(height_sample_num));
width_scale = 1.0f / width_ratio;
height_scale = 1.0f / height_ratio;
num_rois = in_attr[1]->shape->data[1];
inWidth = in_attr[0]->shape->data[0];
inHeight = in_attr[0]->shape->data[1];
inDepth = in_attr[0]->shape->data[2];
outWidth = out_attr[0]->shape->data[0];
outHeight = out_attr[0]->shape->data[1];
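    /* Each ROI box is given as (x1, y1, x2, y2); quantized (U16) boxes are stored
     * in 0.125-pixel units. The box is mapped into feature-map coordinates with the
     * width/height scales and split into outWidth x outHeight bins. */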
for (n = 0; n < num_rois; n++)
{
uint32_t batchId = (uint32_t)f32_in_buffer[2][n];
float scale = (in_attr[1]->dtype == U16) ? 0.125f : 1.0f;
float qx1 = f32_in_buffer[1][n * kRoiDim];
float qy1 = f32_in_buffer[1][n * kRoiDim + 1];
float qx2 = f32_in_buffer[1][n * kRoiDim + 2];
float qy2 = f32_in_buffer[1][n * kRoiDim + 3];
float x1 = qx1 * scale;
float x2 = qx2 * scale;
float y1 = qy1 * scale;
float y2 = qy2 * scale;
float roi_anchor_x = x1 * width_scale;
float roi_anchor_y = y1 * height_scale;
float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f);
float roi_dims_y = vsi_nn_max((y2 - y1) * height_scale, 1.0f);
float bin_size_x = roi_dims_x / outWidth;
float bin_size_y = roi_dims_y / outHeight;
int32_t batch_base_index = batchId * inHeight * inWidth * inDepth;
int32_t ch = 0;
int32_t py = 0;
int32_t px = 0;
for (ch = 0; ch < inDepth; ch++)
{
for (py = 0; py < outHeight; py++)
{
for (px = 0; px < outWidth; px++)
{
float region_start_x = _compute_region_coordinate(px, bin_size_x,
roi_anchor_x, (float)inWidth);
float region_start_y = _compute_region_coordinate(py, bin_size_y,
roi_anchor_y, (float)inHeight);
float region_end_x = _compute_region_coordinate(px + 1, bin_size_x,
roi_anchor_x, (float)inWidth);
float region_end_y = _compute_region_coordinate(py + 1, bin_size_y,
roi_anchor_y, (float)inHeight);
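                    /* If no sampling ratio was supplied (<= 0), fall back to an
                     * adaptive grid of ceil(bin_size) samples per bin. */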
int32_t roi_bin_grid_x = (width_sample_num > 0) ? width_sample_num : (int32_t)(ceil(bin_size_x));
int32_t roi_bin_grid_y = (height_sample_num > 0) ? height_sample_num : (int32_t)(ceil(bin_size_y));
float *input_ptr = &f32_in_buffer[0][batch_base_index + ch * inWidth * inHeight];
float out_val = 0;
out_val = _roi_align_1x1(
input_ptr, inWidth, inHeight, region_start_x, bin_size_x,
roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
roi_bin_grid_y, region_end_y);
f32_out_buffer[0][out_index++] = out_val;
}
}
}
}
/* save data */
for (i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _roi_align_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _roi_align_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROI_ALIGN_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" );
float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" );
int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" );
int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" );
status = _query_kernel( kernel, inputs, outputs );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &width_ratio );
node_params[SCALAR_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &height_ratio );
node_params[SCALAR_X_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &width_sample_num );
node_params[SCALAR_Y_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &height_sample_num );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ROI_ALIGN_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_RATIO] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_RATIO] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_SAMPLE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_SAMPLE] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( roi_align, _setup )

View File

@ -0,0 +1,230 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (2)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.space2depth_internal")
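/* CPU kernel parameter layout: param[0] = input tensor, param[1] = output tensor,
 * param[2] = block_size_x, param[3] = block_size_y. */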
DEF_KERNEL_EXECUTOR(_space2depth_internal_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[2] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
int32_t block_size_x = 1;
int32_t block_size_y = 1;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size_x);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size_y);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
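    /* SPACE2DEPTH: every block_size_x * block_size_y spatial block of the input is
     * folded into the depth dimension; the in-block offsets select the output
     * channel group while the spatial extent shrinks by the block factors. */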
{
uint32_t output_depth = attr[1]->shape->data[2];
uint32_t output_height = attr[1]->shape->data[1];
uint32_t output_width = attr[1]->shape->data[0];
uint32_t input_batch = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1;
uint32_t input_depth = attr[0]->shape->data[2];
uint32_t input_height = attr[0]->shape->data[1];
uint32_t input_width = attr[0]->shape->data[0];
uint32_t batch = 0, in_h = 0, in_w = 0;
for (batch = 0; batch < input_batch; ++ batch)
{
uint32_t output_batch_index = batch * output_height * output_width * output_depth;
uint32_t input_batch_index = batch * input_height * input_width * input_depth;
uint32_t in_d = 0;
for (in_d = 0; in_d < input_depth; in_d ++)
{
for (in_h = 0; in_h < input_height; ++ in_h)
{
for (in_w = 0; in_w < input_width; in_w ++)
{
uint32_t out_w = in_w / block_size_x;
uint32_t out_h = in_h / block_size_y;
uint32_t out_d = (in_w % block_size_x) * input_depth
+ (in_h % block_size_y) * block_size_x * input_depth + in_d;
uint32_t in_index = in_w + in_h * input_width
+ in_d * input_height * input_width + input_batch_index;
uint32_t out_index = out_w + out_h * output_width
+ out_d * output_width * output_height + output_batch_index;
buffer[1][out_index] = buffer[0][in_index];
}
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
if ( buffer[i] )
{
free( buffer[i] );
}
}
return status;
} /* _space2depth_internal_exec() */
/*
* Kernel params
*/
static vx_param_description_t _space2depth_internal_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _SPACE2DEPTH_INTERNAL_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_space2depth_internal_exec,
_space2depth_internal_kernel_param_def,
_cnt_of_array( _space2depth_internal_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( inputs, outputs, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" );
int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_x );
backend_params[index] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_y );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[2] );
vsi_nn_kernel_scalar_release( &backend_params[3] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( space2depth_internal, _setup )

View File

@ -0,0 +1,264 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.upsamplescale")
/*
* Kernel params
*/
static vx_param_description_t _upsamplescale_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _UPSAMPLESCALE_PARAM_NUM _cnt_of_array( _upsamplescale_kernel_param_def )
#define SCALAR_STRIDE_VALUE (2)
#define SCALAR_SCALE_VALUE (3)
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
int32_t i = 0;
int32_t stride = 0;
float scale = 0.0f;
int32_t width = 0;
int32_t height = 0;
int32_t out_width = 0;
int32_t out_height = 0;
int32_t outerSize = 1;
int32_t x = 0;
int32_t y = 0;
/* prepare data */
for(i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_VALUE], &stride);
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &scale);
width = in_attr[0]->shape->data[0];
height = in_attr[0]->shape->data[1];
for (i = 2; i < (int32_t)in_attr[0]->shape->size; i++)
{
outerSize *= in_attr[0]->shape->data[i];
}
out_width = out_attr[0]->shape->data[0];
out_height = out_attr[0]->shape->data[1];
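    /* Nearest-neighbour upsampling by 'stride' combined with a scalar multiply:
     * every input element is scaled by 'scale' and replicated into a
     * stride x stride block of the output plane. */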
for (i = 0; i < outerSize; i++)
{
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
int32_t in_idx = i * width * height + y * width + x;
int32_t base_idx = i * out_width * out_height
+ y * stride * out_width + x * stride;
int32_t dx = 0;
int32_t dy = 0;
float data = f32_in_buffer[0][in_idx] * scale;
for (dy = 0; dy < stride; dy++)
{
for (dx = 0; dx < stride; dx++)
{
int32_t idx = base_idx + dy * out_width + dx;
f32_out_buffer[0][idx] = data;
}
}
}
}
}
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for(i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_SUCCESS;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _upsamplescale_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _upsamplescale_kernel_param_def );
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_UPSAMPLESCALE_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t stride = 0;
float scale = 1.0f;
stride = vsi_nn_kernel_param_get_int32(params, "stride");
scale = vsi_nn_kernel_param_get_float32(params, "scale");
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLESCALE_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_STRIDE_VALUE] = vsi_nn_kernel_scalar_create(
graph, I32, &stride );
node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create(
graph, F32, &scale );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLESCALE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_VALUE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( upsamplescale, _setup )

View File

@ -79,8 +79,10 @@ typedef struct
static const _kernel_map_type _a_times_b_plus_c_kernel_map[] =
{
PACK_KERNEL_MAP(F16, F16, F16, F16),
PACK_KERNEL_MAP(F16, F16, F32, F16),
PACK_KERNEL_MAP_2D(F16, F16, F16, F16),
PACK_KERNEL_MAP_2D(F16, F16, F32, F16),
};
/*
@ -106,7 +108,7 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer)
)
{
#define _PACK_A_TIMES_B_PLUS_C_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
(( IN1_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE))
(( IN2_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE))
vsi_status status = VX_SUCCESS;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
@ -183,6 +185,48 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_A_TIMES_B_PLUS_C_KEY( F16, F16, F32, F16 ):
{
gpu_dp_inst_t uniA_Times_B_lo_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniA_Times_B_hi_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniA_Times_B_lo_4x4", &uniA_Times_B_lo_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniA_Times_B_hi_4x4", &uniA_Times_B_hi_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractHalf8_2x8", &uniExtractHalf8_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
@ -223,13 +267,13 @@ static vsi_status _query_kernel
vx_param_description_t * param_def = _a_times_b_plus_c_kernel_param_def;
size_t param_def_size = _cnt_of_array( _a_times_b_plus_c_kernel_param_def );
vx_kernel_initialize_f initializer = _a_times_b_plus_c_initializer;
uint32_t key;
uint32_t i;
uint32_t key = 0;
uint32_t i = 0;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = A_TIMES_B_PLUS_C_HASH_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d);

View File

@ -53,18 +53,34 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_GATHER_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16")
#define VX_KERNEL_NAME_GATHER_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8")
#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_I8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_I16toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_F16toI8_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_F16toI16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0")
#define KERNEL_SOURCE_1 "gather"
#define KERNEL_SOURCE_2 "gather_mix"
// Add kernel hashtable here
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _quant_type) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_quant_type))
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0))
#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1), \
VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -81,6 +97,16 @@ static const struct {
TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, F16, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(I8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(I16, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I8, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_2)
};
/*
@ -123,7 +149,7 @@ static vsi_status get_gather_tensor_reshape_size
sizes[i] = 1;
}
if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = elementCnt;
sizes[1] = 1;
@ -131,7 +157,7 @@ static vsi_status get_gather_tensor_reshape_size
}
else
{
if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
@ -191,7 +217,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
src0Scale = attr[0]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
dstScale = attr[2]->asymm.scale;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
@ -202,12 +228,12 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
src0Scale = 1;
}
if( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
@ -219,7 +245,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
}
dstScale = 1.0f/dstScale;
}
else if( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
}
@ -232,7 +258,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
}
shaderParam.global_scale[0] = 16;
if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
@ -340,6 +366,214 @@ OnError:
return status;
}
DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
int32_t block_num = 0;
int32_t indices_num = 1;
uint32_t input_dims1 = 0;
vx_uint32 i = 0;
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_int_array_t * input1_shape = NULL;
int32_t src0ZP = 0;
float src0Scale = 0;
int32_t dstZP = 0;
float dstScale = 0;
uint32_t pack_key = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
dstZP = attr[2]->asymm.zero_point;
dstScale = attr[2]->asymm.scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
src0Scale = 1;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[2]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
dstScale = 1.0f/dstScale;
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
}
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
for (i = 0; i < input_dims1; i++)
{
indices_num *= input1_shape->data[i];
}
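    /* One shader thread handles 4 indices along x (global_scale[0] = 4);
     * the y dimension walks the block_num outer blocks. */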
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((indices_num + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = block_num;
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE) \
(IN0_TYPE | (OUT_TYPE << 8))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype);
{
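        /* gpu_quantize_multiplier_16bit() folds the float rescale factor
         * (src0Scale / dstScale) into a 16-bit multiplier M0 plus a post-shift,
         * which the DP instructions below consume together with the zero points. */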
uint16_t M0 = 0;
int32_t postShift = 0;
uint32_t multAndoutZP0[2] = {0};
uint32_t multAndoutZP1[2] = {0};
gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtraCopyDpKeepinEvis_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
switch( pack_key )
{
case _PACK_SELECT_KEY( U8, F16):
case _PACK_SELECT_KEY( I8, F16):
case _PACK_SELECT_KEY( I16, F16):
{
gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0);
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, U8):
case _PACK_SELECT_KEY( F16, I8):
case _PACK_SELECT_KEY( F16, I16):
{
int32_t postShift0 = 0;
gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
multAndoutZP1[0] = (uint32_t)(M0);
multAndoutZP1[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift0 );
status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( I16, I16):
case _PACK_SELECT_KEY( I8, I8):
case _PACK_SELECT_KEY( U8, U8):
case _PACK_SELECT_KEY( F16, F16):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtraCopyDpKeepinEvis_2x8", &uniExtraCopyDpKeepinEvis_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
}
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num);
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
if (attr[2])
{
vsi_nn_kernel_tensor_attr_release( &attr[2] );
attr[2] = NULL;
}
return status;
}
/*
* Query kernel
*/
@ -348,7 +582,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params
const vsi_nn_kernel_param_t * params,
int32_t axis
)
{
vsi_status status = VSI_FAILURE;
@ -360,21 +595,28 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis );
for( i = 0; i < _cnt_of_array(gather_map); i ++ )
{
if( gather_map[i].key == key )
if ( gather_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(gather_map) )
if ( i < _cnt_of_array(gather_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name );
kernel->info.parameters = _gather_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _gather_kernel_param_def );
kernel->info.initialize = _gather_initializer;
if (axis)
{
kernel->info.initialize = _gather_axis0_initializer;
}
else
{
kernel->info.initialize = _gather_initializer;
}
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
@ -405,26 +647,39 @@ static vsi_nn_kernel_node_t _setup
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t axis0_flg = 0;
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0);
if(status != VSI_SUCCESS)
if (axis == 0)
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0);
axis0_flg = 1;
}
else
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0);
axis0_flg = 0;
}
if (status != VSI_SUCCESS)
{
return NULL;
}
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 0;
#define RESHAPE_DIM 2

View File

@ -183,7 +183,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_int_array_t * input_shape = NULL;
float scaleIn = 0;
float scaleIn = 1;
int32_t input_zp = 0;
vx_uint32 iter = 0;
int32_t sumInZp = 0;
@ -206,10 +206,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
input_shape = attr[0]->shape;
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
if(attr[0]->dtype == I8 || attr[0]->dtype == I16)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -225,13 +228,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
if(rsFlg)
if (rsFlg)
{
height = height / chn;
}
iter = height * 16;
if(attr[0]->dtype == U8)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
sumInZp = input_zp * iter * (-1);
tmpZp1 = (-2) * input_zp;
@ -247,11 +250,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
shaderParam.local_size[1] = 1;
shaderParam.local_size[2] = 1;
if(attr[0]->dtype == I8 || attr[0]->dtype == U8)
if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
{
shaderParam.global_size[0] = (width + 255) / 256 * 16;
}
else if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_size[0] = (width + 127) / 128 * 16;
}
@ -261,7 +264,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
if(attr[0]->dtype == U8)
if (attr[0]->dtype == U8)
{
gpu_dp_inst_t uniSumU8_16x1 = {{
0x55555555, // TCfg
@ -290,7 +293,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if(attr[0]->dtype == I8)
else if (attr[0]->dtype == I8)
{
gpu_dp_inst_t uniSumInt8_16x1 = {{
0x55555555, // TCfg
@ -317,7 +320,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if(attr[0]->dtype == I16)
else if (attr[0]->dtype == I16)
{
gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{
0x55555555, // TCfg
@ -333,7 +336,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if(attr[0]->dtype == F16)
else if (attr[0]->dtype == F16)
{
gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{
0x55555555, // TCfg
@ -384,10 +387,10 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_int_array_t * input_shape = NULL;
float scaleIn = 0;
float scaleOut = 0;
float reScaleOut_u8 = 0;
float scale_inOut = 0;
float scaleIn = 1.0f;
float scaleOut = 1.0f;
float reScaleOut_u8 = 1.0f;
float scale_inOut = 1.0f;
int32_t output_zp = 0;
int32_t input_zp = 0;
float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1;
@ -407,12 +410,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
input_shape = attr[0]->shape;
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
output_zp = attr[2]->asymm.zero_point;
scaleOut = attr[2]->asymm.scale;
if(attr[0]->dtype == I8 || attr[0]->dtype == I16)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -422,9 +426,16 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
{
in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
input_zp = 0;
}
if(attr[2]->dtype == I8 || attr[2]->dtype == I16)
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_zp = attr[2]->asymm.zero_point;
scaleOut = attr[2]->asymm.scale;
reScaleOut_u8 = 1 / scaleOut;
}
else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[2]->dfp.fl > 0)
{
@ -434,10 +445,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
{
out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
}
output_zp = 0;
}
if((attr[2]->dtype == I8 || attr[2]->dtype == I16)
&& (attr[0]->dtype == I8 || attr[0]->dtype == I16))
if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
&& (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP))
{
inOut_fl_scale = in_scale_fl * out_scale_fl;
}
@ -445,21 +457,17 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
if(rsFlg)
if (rsFlg)
{
height = height / chn;
}
if(attr[2]->dtype == U8)
{
reScaleOut_u8 = 1 / scaleOut;
}
dimRatio = (float)(1.0 / (width * height));
group_num = (width + 255) / 256;
shaderParam.global_scale[0] = 16;
if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
group_num = (width + 127) / 128;
@ -774,12 +782,12 @@ static vsi_status _query_kernel
for( i = 0; i < kernel_map_size; i ++ )
{
if( kernel_map[i].key == hashkey )
if ( kernel_map[i].key == hashkey )
{
break;
}
}
if( i < kernel_map_size )
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@ -830,7 +838,7 @@ static vsi_nn_kernel_node_t _setup
int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" );
// Check if gpu can support the size
if( !vsi_nn_kernel_gpu_check_shape(
if ( !vsi_nn_kernel_gpu_check_shape(
(int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
{
return NULL;
@ -850,7 +858,7 @@ static vsi_nn_kernel_node_t _setup
attr.size[0] = ((inputs[0]->attr.size[0] + 255) / 256) * 4;
if( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
{
attr.size[0] = ((inputs[0]->attr.size[0] + 127) / 128) * 4;
@ -868,17 +876,17 @@ static vsi_nn_kernel_node_t _setup
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg );
status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
if( VSI_SUCCESS != status )
if ( VSI_SUCCESS != status )
{
goto final;
}
status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM );
if( VSI_SUCCESS != status )
if ( VSI_SUCCESS != status )
{
goto final;
}
if(reshape_flg)
if (reshape_flg)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[0]->attr.size[0];
@ -893,7 +901,7 @@ static vsi_nn_kernel_node_t _setup
shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
}
if(inputs[1]->attr.dim_num < 2)
if (inputs[1]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[1]->attr.size[0];
@ -902,7 +910,7 @@ static vsi_nn_kernel_node_t _setup
shape[3] = 1;
rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 );
}
if(inputs[2]->attr.dim_num < 2)
if (inputs[2]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[2]->attr.size[0];
@ -914,10 +922,10 @@ static vsi_nn_kernel_node_t _setup
// Mean Vari
{
tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
if(tmp_node)
if (tmp_node)
{
uint32_t index = 0;
if(reshape_flg)
if (reshape_flg)
{
mean_vari_node_params[index++] = rs_input;
vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index],
@ -943,7 +951,7 @@ static vsi_nn_kernel_node_t _setup
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U8 = 0;
border.constant_value.U16 = 0;
if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
{
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
}
@ -956,10 +964,10 @@ static vsi_nn_kernel_node_t _setup
// Nomalization
{
node = vsi_nn_kernel_create_node( graph, kernel );
if(node)
if (node)
{
uint32_t index = 0;
if(reshape_flg)
if (reshape_flg)
{
node_params[index++] = rs_input;
}
@ -967,7 +975,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
}
if(inputs[1]->attr.dim_num < 2)
if (inputs[1]->attr.dim_num < 2)
{
node_params[index++] = rs_beta;
}
@ -975,7 +983,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
}
if(inputs[2]->attr.dim_num < 2)
if (inputs[2]->attr.dim_num < 2)
{
node_params[index++] = rs_gamma;
}
@ -984,7 +992,7 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
if(reshape_flg)
if (reshape_flg)
{
node_params[index++] = rs_output;
}
@ -1006,9 +1014,9 @@ static vsi_nn_kernel_node_t _setup
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U8 = 0;
border.constant_value.U16 = 0;
if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
{
border.constant_value.U8 = (vx_uint8)outputs[0]->attr.dtype.zero_point;
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
}
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
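The hunk above switches the normalization node's border constant from the output's zero point to the input's: out-of-bounds reads happen on the input tensor, and for asymmetric U8 a padded value equal to the input zero point dequantizes to exactly zero. A hedged one-liner illustrating that, with names local to this note:

/* real = (q - zero_point) * scale, so q == zero_point contributes 0.0f regardless of scale. */
static float dequant_u8( uint8_t q, int32_t zero_point, float scale )
{
    return ( (int32_t)q - zero_point ) * scale;
}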
@ -1018,31 +1026,31 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
final:
if(rs_beta)
if (rs_beta)
{
vsi_nn_kernel_tensor_release( &rs_beta );
}
if(rs_gamma)
if (rs_gamma)
{
vsi_nn_kernel_tensor_release( &rs_gamma );
}
if(reshape_flg)
if (reshape_flg)
{
vsi_nn_kernel_tensor_release( &rs_input );
vsi_nn_kernel_tensor_release( &rs_output );
}
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
{
if( ikernels[i] )
if ( ikernels[i] )
{
vsi_nn_kernel_release( &ikernels[i] );
}
if( tensors[i] )
if ( tensors[i] )
{
vsi_nn_ReleaseTensor( &tensors[i] );
}
}
if(tmp_node) {vsi_nn_kernel_node_release( &tmp_node );}
if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );}
return node;
} /* _setup() */

File diff suppressed because it is too large

View File

@ -68,7 +68,6 @@ static const struct {
{
TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_2)
};
static vx_param_description_t vxPreProcessBgraKernel_param_def[] =
@ -106,7 +105,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
int32_t dstZP = 0;
float outputScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t xRatio = 0;
int32_t yRatio = 0;
int32_t order1 = 2;
@ -126,8 +124,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
@ -135,19 +131,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(trans)
{
width = width / 3;
}
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -159,11 +150,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
}
dstZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
dstZP = 0;
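Two details in this hunk are easy to miss. The copy fast path assumes the resize ratios are Q15 fixed point, so a value of 1 << 15 encodes a 1:1 scale, and for asymmetric outputs the scale is pre-inverted so the kernel can multiply instead of divide when requantizing. A hedged sketch of both, under those assumptions and with names local to this note:

static int is_copy_ratio( int32_t xRatio, int32_t yRatio )
{
    /* Q15: 1 << 15 == 1.0, i.e. no resampling in either direction. */
    return ( xRatio == (1 << 15) ) && ( yRatio == (1 << 15) );
}

static uint8_t requant_u8( float real, float inv_scale, int32_t zero_point )
{
    /* q = real / scale + zp, written with the pre-inverted scale; round-half-up for real >= 0. */
    return (uint8_t)( real * inv_scale + (float)zero_point + 0.5f );
}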
@ -286,16 +277,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractInt32BgraToU8Bgr_2x8 = {{
0x00333333, // TCfg
0x00111000, // ASelt
0x00020100, 0x00000201, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
// copy
gpu_dp_inst_t uniExtractBfromBgra_4x4 = {{
0x01010101, // TCfg
@ -355,23 +336,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
if(trans)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtractInt32BgraToU8Bgr_2x8",
&uniExtractInt32BgraToU8Bgr_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp1BgraShort_4x4", &uniBilinearTmp1BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp2BgraShort_4x4", &uniBilinearTmp2BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp3BgraShort_4x4", &uniBilinearTmp3BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp4BgraShort_4x4", &uniBilinearTmp4BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp5BgraShort_4x4", &uniBilinearTmp5BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp6BgraShort_4x4", &uniBilinearTmp6BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp7BgraShort_4x4", &uniBilinearTmp7BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp8BgraShort_4x4", &uniBilinearTmp8BgraShort_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniDescaleU8_4x4", &uniDescaleU8_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
else if(enable_copy)
if (enable_copy)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtractBfromBgra_4x4", &uniExtractBfromBgra_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGfromBgra_4x4", &uniExtractGfromBgra_4x4);
@ -429,16 +394,11 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_perm)
{
convert_type = SCALE_NHWC;
}
else if(enable_copy)
if (enable_copy)
{
convert_type = COPY;
}
@ -449,14 +409,14 @@ static vsi_status _query_kernel
key = HASH_PRE_PROCESS_BGRA_KEY( input0_dtype, output_dtype, convert_type, 0 );
for( i = 0; i < _cnt_of_array(pre_process_bgra_map); i ++ )
for ( i = 0; i < _cnt_of_array(pre_process_bgra_map); i ++ )
{
if( pre_process_bgra_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(pre_process_bgra_map) )
if ( i < _cnt_of_array(pre_process_bgra_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_bgra_map[i].function_name );
kernel->info.parameters = vxPreProcessBgraKernel_param_def;
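_query_kernel resolves the shader variant by packing the input dtype, output dtype, and convert type into a single key, then scanning pre_process_bgra_map linearly; a loop index equal to _cnt_of_array(pre_process_bgra_map) means no match and the query fails. A hedged sketch of the key packing; the real field widths belong to HASH_PRE_PROCESS_BGRA_KEY and may differ:

/* Illustrative packing only; the actual macro defines the field layout. */
#define _EXAMPLE_BGRA_KEY( IN_DT, OUT_DT, CONV_TYPE, IMG_2D ) \
    ( ( (uint32_t)(IN_DT) << 24 ) | ( (uint32_t)(OUT_DT) << 16 ) | \
      ( (uint32_t)(CONV_TYPE) << 8 ) | (uint32_t)(IMG_2D) )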
@ -488,19 +448,19 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 2;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );

View File

@ -43,7 +43,6 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_trans_U8toU8")
// greater than a quarter
#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq")
@ -51,7 +50,6 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_1 "pre_process_nv12_scale_8bits",
#define KERNEL_SOURCE_2 "pre_process_nv12_scale",
#define KERNEL_SOURCE_3 "pre_process_nv12_trans_u8",
#define KERNEL_SOURCE_4 "pre_process_nv12_scale_mix"
typedef enum
@ -85,7 +83,6 @@ static const struct {
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_4)
};
@ -156,17 +153,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -178,7 +175,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
}
dstZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
dstScale = 1;
dstZP = 0;
@ -295,7 +292,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
int32_t dstZP = 0;
float dstScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
@ -325,8 +321,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[1]->shape;
dstZP = attr[1]->asymm.zero_point;
@ -334,24 +328,21 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
{
width = width / 3;
}
resize = (float)width / attr[0]->shape->data[0];
xrIntFloat_16 = (attr[0]->shape->data[0] << 16) / width + 1;
yrIntFloat_16 = (attr[0]->shape->data[1] << 16) / height + 1;
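    /* xrIntFloat_16 / yrIntFloat_16 are the source step per destination pixel in 16.16 fixed
     * point: step = (src_dim << 16) / dst_dim + 1. The kernel would then walk the source as
     * src_x = ((uint32_t)dst_x * xrIntFloat_16) >> 16 (assumed usage, not shown in this hunk).
     * e.g. src 1920 -> dst 640: (1920 << 16) / 640 + 1 = 0x30001, roughly 3 source pixels per step. */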
if(attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
}
else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[1]->dfp.fl > 0)
{
@ -363,7 +354,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
}
dstZP = 0;
}
else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
dstScale = 1;
dstZP = 0;
@ -450,27 +441,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
}, GPU_DP_TYPE_16 };
//trans
gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{
0x11311311, // TCfg
0x00100100, // ASelt
0x01000400, 0x06020105, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{
0x00003113, // TCfg
0x00001001, // ASelt
0x03070302, 0x00000000, // ABin
0x00000220, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000001, 0x00000001, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniCalculateYShift_2x8 = {{
0x00009999, // TCfg
0x00000000, // ASelt
@ -502,23 +472,15 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
if(resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16) && !trans)
if (resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16))
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
if(trans && attr[1]->dtype == U8)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
}
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
@ -572,20 +534,15 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
uint32_t srcWidth = inputs[0]->attr.size[0];
uint32_t dstWidth = enable_perm ? outputs[0]->attr.size[1] : outputs[0]->attr.size[0];
uint32_t dstWidth = outputs[0]->attr.size[0];
float scaleVal = (float)dstWidth / srcWidth;
uint32_t optFlg = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_perm)
{
convert_type = TRANS;
}
else if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
convert_type = COPY;
}
@ -594,7 +551,7 @@ static vsi_status _query_kernel
convert_type = SCALE;
}
if(scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE)
if (scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE)
{
optFlg = 1;
}
@ -608,7 +565,7 @@ static vsi_status _query_kernel
break;
}
}
if( i < _cnt_of_array(pre_process_nv12_map) )
if ( i < _cnt_of_array(pre_process_nv12_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_map[i].function_name );
kernel->info.parameters = vxPreProcessNv12Kernel_param_def;
@ -646,21 +603,20 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 3;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
@ -674,22 +630,9 @@ static vsi_nn_kernel_node_t _setup
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
if(trans)
{
shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1];
shapes[1] = outputs[0]->attr.size[2];
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num);
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM,
inputs, 2, &reshape_tensors[0], 1 );
}
else
{
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM,
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM,
inputs, 2, outputs, 1 );
}
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );

View File

@ -90,14 +90,6 @@ static const struct {
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, SCALE_NHWC, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, SCALE_NHWC, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, SCALE_NHWC, KERNEL_SOURCE_3)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, COPY_NHWC, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY_NHWC, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY_NHWC, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY_NHWC, KERNEL_SOURCE_4)
};
static vx_param_description_t vxPreProcessRgbKernel_param_def[] =
@ -156,8 +148,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
outputZP = (float)attr[0]->asymm.zero_point;
@ -165,14 +155,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15));
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -184,11 +174,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
outputZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / outputScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
outputScale = 1;
outputZP = 0;
@ -199,48 +189,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
pack_key = _PACK_SELECT_KEY( enable_copy, reorder, trans);
{
// trans and copy
gpu_dp_inst_t uniNormilizationLo_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x45002142, 0x27480324, // ABin
0x99999999, // BSelt
0x06060606, 0x06060606, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000,
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniNormilizationHi_2x8 = {{
0x09999999, // TCfg
0x04444444, // ASelt
0x092a4b06, 0x000c2d4e, // ABin
0x09999999, // BSelt
0x06060606, 0x00060606, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000,
0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniNormilizationLo_NHWC_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x03422100, 0x27064524, // ABin
0x99999999, // BSelt
0x06060606, 0x06060606, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000,
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniNormilizationHi_NHWC_2x8 = {{
0x09999999, // TCfg
0x04444444, // ASelt
0x4b2a0948, 0x004e2d0c, // ABin
0x09999999, // BSelt
0x06060606, 0x00060606, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000,
0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
// copy
gpu_dp_inst_t uniExtractRtoF32_part0_4x4 = {{
0x01010101, // TCfg
@ -404,79 +352,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRePackRGBLo_2x8 = {{
0x00111111, // TCfg
0x00001001, // ASelt
0x01000400, 0x00000105, // ABin
0x00222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRePackRGBHi_2x8 = {{
0x00111111, // TCfg
0x00001001, // ASelt
0x03020602, 0x00000307, // ABin
0x00222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRePackRGBLo_NHWC_2x8 = {{
0x00111111, // TCfg
0x00100100, // ASelt
0x01000400, 0x00000105, // ABin
0x00222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniRePackRGBHi_NHWC_2x8 = {{
0x00111111, // TCfg
0x00100100, // ASelt
0x03020602, 0x00000307, // ABin
0x00222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
switch( pack_key )
switch ( pack_key )
{
case _PACK_SELECT_KEY( 1, 0, 1): // copy trans
{
shaderParam.global_scale[0] = 15;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_NHWC_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniNormilizationHi_2x8", &uniNormilizationHi_NHWC_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
case _PACK_SELECT_KEY( 1, 2, 1): // copy reorder trans
{
shaderParam.global_scale[0] = 15;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniNormilizationHi_2x8", &uniNormilizationHi_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
case _PACK_SELECT_KEY( 1, 0, 0): // copy
case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder
{
@ -539,68 +417,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
case _PACK_SELECT_KEY( 0, 0, 1): // trans
{
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;
if(attr[0]->dtype == F16)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8);
}
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB);
status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10);
status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift);
status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes);
status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_NHWC_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_NHWC_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
case _PACK_SELECT_KEY( 0, 2, 1): // reorder trans
{
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;
if(attr[0]->dtype == F16)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8);
}
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG);
status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB);
status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10);
status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift);
status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes);
status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
default:
break;
}
@ -637,23 +453,14 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_copy && enable_perm)
{
convert_type = COPY_NHWC;
}
else if(enable_copy)
if (enable_copy)
{
convert_type = COPY;
}
else if(enable_perm)
{
convert_type = SCALE_NHWC;
}
else
{
convert_type = SCALE;
@ -661,14 +468,14 @@ static vsi_status _query_kernel
key = HASH_PRE_PROCESS_RGB_KEY( input0_dtype, output_dtype, convert_type, 0 );
for( i = 0; i < _cnt_of_array(pre_process_rgb_map); i ++ )
for ( i = 0; i < _cnt_of_array(pre_process_rgb_map); i ++ )
{
if( pre_process_rgb_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(pre_process_rgb_map) )
if ( i < _cnt_of_array(pre_process_rgb_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_rgb_map[i].function_name );
kernel->info.parameters = vxPreProcessRgbKernel_param_def;
@ -698,21 +505,20 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_RGB_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 2;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
@ -726,18 +532,7 @@ static vsi_nn_kernel_node_t _setup
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
if(trans)
{
shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1];
shapes[1] = outputs[0]->attr.size[2];
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num);
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM,
inputs, 1, &reshape_tensors[0], 1 );
}
else
if (trans == 0)
{
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM,
inputs, 1, outputs, 1 );
@ -767,7 +562,7 @@ static vsi_nn_kernel_node_t _setup
}
}
if(reshape_tensors[0])
if (reshape_tensors[0])
{
vsi_nn_ReleaseTensor(&reshape_tensors[0]);
}

View File

@ -43,15 +43,12 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_trans_U8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_trans_U8toU8")
#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8",
#define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8",
#define KERNEL_SOURCE_3 "pre_process_yuv420_scale_fp16",
#define KERNEL_SOURCE_4 "pre_process_yuv420_scale_i16",
#define KERNEL_SOURCE_5 "pre_process_yuv420_scale_i8",
#define KERNEL_SOURCE_6 "pre_process_yuv420_trans_u8"
typedef enum
{
@ -80,8 +77,6 @@ static const struct {
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_6)
};
static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] =
@ -143,24 +138,24 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
if (trans)
{
width = width / 3;
}
if(attr[0]->dtype == U8)
if (attr[0]->dtype == U8)
{
dstScale = 1.0f / dstScale;
}
shaderParam.global_scale[0] = 16;
if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
@ -176,131 +171,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniPackBG0_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x01000000, 0x02020001, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmpAndR_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03000100, 0x07060104, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackRB0_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x03000302, 0x05040004, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03030100, 0x07060404, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackGR1_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x06000505, 0x07070006, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03060100, 0x07060704, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackBG1_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x09000808, 0x0a0a0009, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03080100, 0x07060904, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackRB2_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x0b000b0a, 0x0d0c000c, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x030b0100, 0x07060c04, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackGR2_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x0e000d0d, 0x0f0f000e, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x030e0100, 0x07060f04, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
@ -574,19 +444,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB0_2x8", &uniPackRB0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8);
@ -633,7 +490,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
int32_t dstZP = 0;
float dstScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
@ -646,8 +502,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
@ -655,17 +509,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
{
width = width / 3;
}
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -677,11 +527,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
}
dstZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f/dstScale;
dstScale = 1.0f / dstScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
@ -925,26 +775,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
//trans
gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{
0x11311311, // TCfg
0x00100100, // ASelt
0x01000400, 0x06020105, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{
0x00003113, // TCfg
0x00001001, // ASelt
0x03070302, 0x00000000, // ABin
0x00000220, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8);
@ -975,16 +805,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4);
if(trans)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
}
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
@ -1041,20 +862,11 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_perm && enable_copy)
{
convert_type = COPY_TRANS;
}
else if(enable_perm)
{
convert_type = TRANS;
}
else if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
convert_type = COPY;
}
@ -1065,20 +877,20 @@ static vsi_status _query_kernel
key = HASH_PRE_PROCESS_YUV420_KEY( input0_dtype, output_dtype, convert_type, 0 );
for( i = 0; i < _cnt_of_array(pre_process_yuv420_map); i ++ )
for ( i = 0; i < _cnt_of_array(pre_process_yuv420_map); i ++ )
{
if( pre_process_yuv420_map[i].key == key )
if ( pre_process_yuv420_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(pre_process_yuv420_map) )
if ( i < _cnt_of_array(pre_process_yuv420_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv420_map[i].function_name );
kernel->info.parameters = vxPreProcessYuv420Kernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def );
if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
kernel->info.initialize = _pre_process_yuv420_copy_initializer;
}
@ -1110,21 +922,20 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV420_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 4;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
@ -1138,22 +949,10 @@ static vsi_nn_kernel_node_t _setup
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
if(trans)
{
shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1];
shapes[1] = outputs[0]->attr.size[2];
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num);
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM,
inputs, 3, outputs, 1 );
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM,
inputs, 3, &reshape_tensors[0], 1 );
}
else
{
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM,
inputs, 3, outputs, 1 );
}
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
@ -1178,7 +977,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[13] );
}
}
if(reshape_tensors[0])
if (reshape_tensors[0])
{
vsi_nn_ReleaseTensor(&reshape_tensors[0]);
}

View File

@ -43,11 +43,8 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_trans_U8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_trans_U8toU8")
#define KERNEL_SOURCE_1 "pre_process_yuv444_scale",
#define KERNEL_SOURCE_2 "pre_process_yuv444_trans_u8",
#define KERNEL_SOURCE_3 "pre_process_yuv444_scale_fp16",
#define KERNEL_SOURCE_4 "pre_process_yuv444_copy_u8",
@ -78,8 +75,6 @@ static const struct {
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_2)
};
static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] =
@ -119,7 +114,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
int32_t dstZP = 0;
float dstScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
@ -132,8 +126,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
@ -141,24 +133,19 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
{
width = width / 3;
}
if(attr[0]->dtype == U8)
if (attr[0]->dtype == U8)
{
dstScale = 1.0f / dstScale;
}
shaderParam.global_scale[0] = 16;
if(attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
@ -174,131 +161,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniPackBG0_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x01000000, 0x02020001, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmpAndR_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03000100, 0x07060104, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackRB0_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x03000302, 0x05040004, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03030100, 0x07060404, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackGR1_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x06000505, 0x07070006, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03060100, 0x07060704, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackBG1_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x09000808, 0x0a0a0009, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x03080100, 0x07060904, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackRB2_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x0b000b0a, 0x0d0c000c, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x030b0100, 0x07060c04, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackGR2_2x8 = {{
0x11011011, // TCfg
0x10010010, // ASelt
0x0e000d0d, 0x0f0f000e, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001,
0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{
0x11111111, // TCfg
0x00100100, // ASelt
0x030e0100, 0x07060f04, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
@ -563,19 +425,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB0_2x8", &uniPackRB0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8);
@ -622,7 +471,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
int32_t dstZP = 0;
float dstScale = 1;
int32_t reorder = 0;
int32_t trans = 0;
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
@ -635,8 +483,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
@ -644,17 +490,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
width = out_shape->data[0];
height = out_shape->data[1];
if(reorder != 0)
if (reorder != 0)
{
reorder = 2;
order1 = 0;
}
if(trans)
{
width = width / 3;
}
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
@ -666,11 +508,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
}
dstZP = 0;
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f/dstScale;
dstScale = 1.0f / dstScale;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
@ -914,26 +756,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
//trans
gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{
0x11311311, // TCfg
0x00100100, // ASelt
0x01000400, 0x06020105, // ABin
0x22022022, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{
0x00003113, // TCfg
0x00001001, // ASelt
0x03070302, 0x00000000, // ABin
0x00000220, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8);
@ -963,17 +785,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
if(trans)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
}
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
@ -1024,20 +837,11 @@ static vsi_status _query_kernel
uint32_t key = 0;
int i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(enable_perm && enable_copy)
{
convert_type = COPY_TRANS;
}
else if(enable_perm)
{
convert_type = TRANS;
}
else if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
convert_type = COPY;
}
@ -1048,20 +852,20 @@ static vsi_status _query_kernel
key = HASH_PRE_PROCESS_YUV444_KEY( input0_dtype, output_dtype, convert_type, 0 );
for( i = 0; i < _cnt_of_array(pre_process_yuv444_map); i ++ )
for ( i = 0; i < _cnt_of_array(pre_process_yuv444_map); i ++ )
{
if( pre_process_yuv444_map[i].key == key )
if ( pre_process_yuv444_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(pre_process_yuv444_map) )
if ( i < _cnt_of_array(pre_process_yuv444_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv444_map[i].function_name );
kernel->info.parameters = vxPreProcessYuv444Kernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessYuv444Kernel_param_def );
if(enable_copy && output_dtype == U8)
if (enable_copy && output_dtype == U8)
{
kernel->info.initialize = _pre_process_yuv444_copy_initializer;
}
@ -1093,21 +897,20 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV444_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" );
int32_t trans = 0;
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 4;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
@ -1121,22 +924,9 @@ static vsi_nn_kernel_node_t _setup
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
if(trans)
{
shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1];
shapes[1] = outputs[0]->attr.size[2];
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM,
inputs, 3, outputs, 1 );
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num);
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM,
inputs, 3, &reshape_tensors[0], 1 );
}
else
{
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM,
inputs, 3, outputs, 1 );
}
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );

View File

@ -369,6 +369,26 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniRightSubLeft_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00230001, 0x00670045, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
if (I8 == input_dtype && I8 == output_dtype && out_width > in_width)
{
@ -405,7 +425,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4",
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
@ -447,16 +468,22 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4",
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4",
&uniConvertDFP2FP32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_scale[2] = depth;
}
else
{
status = vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniDFPtoFp32_left_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_4x4", &uniConvertDFP2FP32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale);
CHECK_STATUS_FAIL_GOTO(status, final );
@ -485,10 +512,33 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8SubZPtoFp32_left_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00020000, 0x00060004, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00230001, 0x00670045, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
if (F16 == output_dtype)
{
status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale);
status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
@ -544,13 +594,21 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
}
else
{
status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4);
"uniU8RightSubLeft_4x4", &uniU8SubZPtoFp32_part1_4x4);
}
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_scale[2] = depth;
}
else if (!is_use_scale_kernel)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
if (!is_use_scale_kernel)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
@ -562,8 +620,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
if (!is_use_scale_kernel)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_4x4", &uniU8SubZPtoFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP);
status = vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP);
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -581,25 +638,25 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniFp16toFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniRightSubLeft_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00110000, 0x00330022, // ABin
0x00000000, // ASelt
0x00230001, 0x00670045, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000,
0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniFp16toFp32_left_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtactHalf8_2x8 = {{
0x11111111, // TCfg
@ -634,7 +691,17 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniFp16toFp32_part1_4x4 = {{
gpu_dp_inst_t uniFp16toFp32_Lo_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniFp16toFp32_Hi_4x4 = {{
0x09090909, // TCfg
0x00000000, // ASelt
0x00150004, 0x00370026, // ABin
@ -647,7 +714,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_Lo_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniFp16toFp32_Hi_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth);
CHECK_STATUS_FAIL_GOTO(status, final );
@ -657,19 +725,21 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
else if (F16 == output_dtype)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_left_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_left_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &uint8Scale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &uint8ZP_out);
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (BF16 == input_dtype && BF16 == output_dtype)

View File

@ -0,0 +1,366 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_U8TOU8 CVIVANTE_NAMESPACE("evis.space2depth_internal_U8toU8")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_U8TOU8_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_U8toU8_X2Y1")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I8TOI8 CVIVANTE_NAMESPACE("evis.space2depth_internal_I8toI8")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I8TOI8_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_I8toI8_X2Y1")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I16TOI16 CVIVANTE_NAMESPACE("evis.space2depth_internal_I16toI16")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I16TOI16_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_I16toI16_X2Y1")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_F16TOF16 CVIVANTE_NAMESPACE("evis.space2depth_internal_F16toF16")
#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_F16TOF16_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_F16toF16_X2Y1")
#define KERNEL_SOURCE_1 "space2depth_internal"
// Add kernel hashtable here
#define HASH_SPACE2DEPTH_INTERNAL_KEY(_input0_type, _output_type, _opt_stride) \
((_input0_type << 24) | (_output_type << 16) | (_opt_stride << 8))
#define TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 1), \
VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_##IN0_TYPE##TO##OUT_TYPE##_X2Y1, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} space2depth_internal_map[] =
{
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(U8, U8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(I8, I8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(I16, I16, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(F16, F16, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(U8, U8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(I8, I8, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(I16, I16, KERNEL_SOURCE_1)
TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(F16, F16, KERNEL_SOURCE_1)
};
/*
* Kernel params
*/
static vx_param_description_t _space2depth_internal_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _SPACE2DEPTH_INTERNAL_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
uint32_t input_dims = 0;
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
int32_t input_width = 0;
int32_t input_height = 0;
int32_t input_depth = 0;
int32_t stride_x = 0;
int32_t stride_y = 0;
int32_t opt_flg = 0;
uint32_t pack_key = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &stride_x);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &stride_y);
CHECK_STATUS_FAIL_GOTO(status, OnError );
input_dims = (uint32_t)attr[0]->shape->size;
input_width = attr[0]->shape->data[0];
input_height = attr[0]->shape->data[1];
input_depth = input_dims > 2 ? attr[0]->shape->data[2] : 1;
shaderParam.global_scale[0] = 1;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
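/* Fast path for stride_x == 2, stride_y == 1: each thread consumes 16 bytes of
 * input per iteration, i.e. 16 pixels for 8-bit types and 8 pixels for 16-bit types. */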
if (stride_x == 2 && stride_y == 1)
{
shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == F16 || attr[0]->dtype == I16)
{
shaderParam.global_scale[0] = 8;
}
opt_flg = 1;
}
shaderParam.global_size[0] = gpu_align_p2((input_width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = input_height;
shaderParam.global_size[2] = input_depth;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, OPT_FLG ) \
(IN0_TYPE | (OUT_TYPE << 8) | (OPT_FLG << 16))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, opt_flg);
{
gpu_dp_inst_t uniExtractEvenUint8Stride2_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x06040200, 0x0e0c0a08, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000700, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddUint8Stride2_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x07050301, 0x0f0d0b09, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000700, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractEvenFp16Stride2_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddFp16Stride2_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "input_depth", &input_depth);
CHECK_STATUS_FAIL_GOTO(status, OnError );
switch( pack_key )
{
case _PACK_SELECT_KEY( U8, U8, 0 ):
case _PACK_SELECT_KEY( I8, I8, 0 ):
case _PACK_SELECT_KEY( I16, I16, 0 ):
case _PACK_SELECT_KEY( F16, F16, 0 ):
break;
case _PACK_SELECT_KEY( U8, U8, 1 ):
case _PACK_SELECT_KEY( I8, I8, 1 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtractEvenUint8Stride2_2x8", &uniExtractEvenUint8Stride2_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddUint8Stride2_2x8", &uniExtractOddUint8Stride2_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( I16, I16, 1 ):
case _PACK_SELECT_KEY( F16, F16, 1 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtractEvenFp16Stride2_4x4", &uniExtractEvenFp16Stride2_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddFp16Stride2_4x4", &uniExtractOddFp16Stride2_4x4 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
}
#undef _PACK_SELECT_KEY
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
}
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t opt_flg
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_SPACE2DEPTH_INTERNAL_KEY( input0_dtype, output_dtype, opt_flg );
for( i = 0; i < _cnt_of_array(space2depth_internal_map); i ++ )
{
if ( space2depth_internal_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(space2depth_internal_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", space2depth_internal_map[i].function_name );
kernel->info.parameters = _space2depth_internal_kernel_param_def;
kernel->info.numParams = _SPACE2DEPTH_INTERNAL_PARAM_NUM;
kernel->info.initialize = _space2depth_internal_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
space2depth_internal_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
space2depth_internal_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_SPACE2DEPTH_INTERNAL_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" );
int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" );
int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0;
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, opt_flg );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
vsi_nn_kernel_node_pack_io( tmp_params, _SPACE2DEPTH_INTERNAL_PARAM_NUM, inputs, 1, outputs, 1 );
tmp_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_x );
tmp_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_y );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _SPACE2DEPTH_INTERNAL_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[2] );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U8 = 0;
border.constant_value.U16 = 0;
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
{
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
}
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( space2depth_internal, _setup )
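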

View File

@ -0,0 +1,422 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
UP_ORG = 0,
UP_K2,
} _internal_upscale_e;
#define _UPSAMPLESCALE_KERNEL_SOURCE "upsamplescale"
#define _UPSAMPLESCALE_KERNEL_K2_SOURCE "upsamplescale_k2"
#define _UPSAMPLESCALE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.upsamplescale")
#define STR(a) #a
// Add kernel hashtable here
#define UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, FLAG ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8) | ( FLAG << 16))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_ORG ), \
CVIVANTE_NAMESPACE("evis.upsamplescale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
_UPSAMPLESCALE_KERNEL_SOURCE }
#define PACK_KERNEL_MAP_K2( IN_DTYPE, OUT_DTYPE ) \
{ UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_K2 ), \
CVIVANTE_NAMESPACE("evis.upsamplescale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_K2"), \
_UPSAMPLESCALE_KERNEL_K2_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _upsamplescale_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F16, F16 ),
PACK_KERNEL_MAP( F16, I16 ),
PACK_KERNEL_MAP( F16, I8 ),
PACK_KERNEL_MAP( F16, U8 ),
PACK_KERNEL_MAP( I16, I16 ),
PACK_KERNEL_MAP( I16, F16 ),
PACK_KERNEL_MAP( I8, I8 ),
PACK_KERNEL_MAP( I8, F16 ),
PACK_KERNEL_MAP( U8, U8 ),
PACK_KERNEL_MAP( U8, F16 ),
PACK_KERNEL_MAP_K2( F16, F16 ),
PACK_KERNEL_MAP_K2( F16, I16 ),
PACK_KERNEL_MAP_K2( F16, I8 ),
PACK_KERNEL_MAP_K2( F16, U8 ),
PACK_KERNEL_MAP_K2( I16, I16 ),
PACK_KERNEL_MAP_K2( I16, F16 ),
PACK_KERNEL_MAP_K2( I8, I8 ),
PACK_KERNEL_MAP_K2( I8, F16 ),
PACK_KERNEL_MAP_K2( U8, U8 ),
PACK_KERNEL_MAP_K2( U8, F16 ),
};
/*
* Kernel params
*/
static vx_param_description_t _upsamplescale_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _UPSAMPLESCALE_PARAM_NUM _cnt_of_array( _upsamplescale_kernel_param_def )
#define SCALAR_STRIDE_VALUE (2)
#define SCALAR_SCALE_VALUE (3)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_upsamplescale_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
#define _PACK_UPSCALE_KEY( IN_TYPE, OUT_TYPE, FLAG ) \
( IN_TYPE | ( OUT_TYPE << 16) | (FLAG << 24) )
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_int_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
int32_t stride = 0;
float scale = 0;
float scaleIn = 1.0f;
float scaleOut = 1.0f;
int32_t output_ZP = 0;
int32_t input_ZP = 0;
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
uint32_t pack_key = 0;
_internal_upscale_e flag = UP_ORG;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape;
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_VALUE], &(stride));
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &(scale));
input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;
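/* For DFP quantization the stored fractional length fl gives an effective scale of
 * 2^-fl; the sign checks below only avoid shifting by a negative count. */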
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >=0 )
scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
else
scaleIn = (float) ((int64_t)1 << -srcFixPointPos);
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant)
{
input_ZP = input_attr->asymm.zero_point;
scaleIn = input_attr->asymm.scale;
}
if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant)
{
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >=0 )
scaleOut = 1.0f / (float) ((int64_t)1 << dstFixPointPos);
else
scaleOut = (float) ((int64_t)1 << -dstFixPointPos);
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant)
{
output_ZP = output_attr->asymm.zero_point;
scaleOut = output_attr->asymm.scale;
}
if (stride == 2 && scale >= 0)
{
flag = UP_K2;
}
if ( flag == UP_K2 )
{
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
gpu_param.global_size[0] = gpu_align_p2(
(in_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(in_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = in_shape->size > 2 ? in_shape->data[2] : 1;
pack_key = _PACK_UPSCALE_KEY( input_dtype, output_dtype, flag );
switch( pack_key )
{
case _PACK_UPSCALE_KEY( F16, F16, UP_K2 ):
case _PACK_UPSCALE_KEY( F16, I16, UP_K2 ):
case _PACK_UPSCALE_KEY( F16, I8, UP_K2 ):
case _PACK_UPSCALE_KEY( F16, U8, UP_K2 ):
case _PACK_UPSCALE_KEY( I16, F16, UP_K2 ):
case _PACK_UPSCALE_KEY( I16, I16, UP_K2 ):
case _PACK_UPSCALE_KEY( I8, F16, UP_K2 ):
case _PACK_UPSCALE_KEY( I8, I8, UP_K2 ):
case _PACK_UPSCALE_KEY( U8, F16, UP_K2 ):
case _PACK_UPSCALE_KEY( U8, U8, UP_K2 ):
{
uint16_t multiplier = 0;
int32_t postShift = 0;
uint32_t multAndoutZP[2] = {0};
gpu_dp_inst_t uniUpSampleScale2X_lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x11111010, 0x13131212, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniUpSampleScale2X_hi_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x15151414, 0x17171616, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
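/* The combined requantization factor scaleIn * scale / scaleOut is approximated by a
 * 16-bit multiplier plus a right shift (postShift), so that
 * (x * multiplier + multAndoutZP[1]) >> postShift maps a quantized input x straight to
 * the output domain; the low bits of postShift are also patched into the PostShift
 * field (data[7]) of the two DP instructions. */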
gpu_quantize_multiplier_16bit(scaleIn * scale / scaleOut, &multiplier, &postShift);
multAndoutZP[0] = (uint32_t)(multiplier);
multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * multiplier);
uniUpSampleScale2X_lo_2x8.data[7] |= (postShift & 0x1F);
uniUpSampleScale2X_hi_2x8.data[7] |= (postShift & 0x1F);
status = vsi_nn_kernel_gpu_add_param( node, "uniUpScale2X_lo_2x8", &uniUpSampleScale2X_lo_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniUpScale2X_hi_2x8", &uniUpSampleScale2X_hi_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP);
}
break;
case _PACK_UPSCALE_KEY( F16, F16, UP_ORG ):
case _PACK_UPSCALE_KEY( F16, I16, UP_ORG ):
case _PACK_UPSCALE_KEY( F16, I8, UP_ORG ):
case _PACK_UPSCALE_KEY( F16, U8, UP_ORG ):
case _PACK_UPSCALE_KEY( I16, F16, UP_ORG ):
case _PACK_UPSCALE_KEY( I16, I16, UP_ORG ):
case _PACK_UPSCALE_KEY( I8, F16, UP_ORG ):
case _PACK_UPSCALE_KEY( I8, I8, UP_ORG ):
case _PACK_UPSCALE_KEY( U8, F16, UP_ORG ):
case _PACK_UPSCALE_KEY( U8, U8, UP_ORG ):
{
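/* Generic path: each element is converted to float and mapped as
 * out = in * output_scale + tail, which equals
 * (in - input_ZP) * scaleIn * scale / scaleOut + output_ZP. */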
float output_scale = scaleIn * scale / scaleOut;
float tail = output_ZP - input_ZP * output_scale;
gpu_dp_inst_t uniConvertDatatoF32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_4x4", &uniConvertDatatoF32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "tail", &tail);
}
break;
default:
break;
}
#undef _PACK_UPSCALE_KEY
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr)
{
vsi_nn_kernel_tensor_attr_release( &input_attr );
input_attr = NULL;
}
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release( &output_attr );
output_attr = NULL;
}
return status;
} /* _upsamplescale_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t stride,
float scale
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _upsamplescale_kernel_map;
vx_param_description_t * param_def = _upsamplescale_kernel_param_def;
size_t param_def_size = _cnt_of_array( _upsamplescale_kernel_param_def );
vx_kernel_initialize_f initializer = _upsamplescale_initializer;
_internal_upscale_e flag = (stride == 2 && scale >= 0 ) ? UP_K2 : UP_ORG;
uint32_t key = 0;
int i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = UPSAMPLESCALE_HASH_KEY( in_dtype, out_dtype, flag );
for( i = 0; i < _cnt_of_array( _upsamplescale_kernel_map ); i ++ )
{
if( kernel_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array( _upsamplescale_kernel_map ) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_UPSAMPLESCALE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" );
float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
status = _query_kernel( kernel, inputs, outputs, stride, scale );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLESCALE_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_STRIDE_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &stride );
node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &scale );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLESCALE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_VALUE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] );
VSI_ASSERT( status == VSI_SUCCESS );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( upsamplescale, _setup )
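For orientation, a minimal host-side sketch of the float math this operator appears to implement, assuming upsamplescale performs nearest-neighbour upsampling by the integer stride followed by a multiplication with scale; the helper name and flat memory layout below are illustrative, not part of the library:
/* Illustrative sketch only: every input pixel is replicated into a
 * stride x stride block of the output and multiplied by scale. */
static void upsamplescale_ref(const float* in, float* out,
    int width, int height, int stride, float scale)
{
    int out_w = width * stride;
    int x, y, dx, dy;
    for (y = 0; y < height; y++)
    {
        for (x = 0; x < width; x++)
        {
            float v = in[y * width + x] * scale;
            for (dy = 0; dy < stride; dy++)
            {
                for (dx = 0; dx < stride; dx++)
                {
                    out[(y * stride + dy) * out_w + (x * stride + dx)] = v;
                }
            }
        }
    }
}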

View File

@ -24,6 +24,7 @@
#include <stdint.h>
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_math.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
@ -53,7 +54,7 @@ static vsi_bool compute_gpu_divisor
int32_t i = 0;
for( i = vsi_nn_min( input_value, limit - 1 ); i > 0; i -- )
{
if( ( i % gcd == 0 ) && ( input_value % i == 0 ) )
if ( ( i % gcd == 0 ) && ( input_value % i == 0 ) )
{
*divisor = i;
return TRUE;
@ -75,7 +76,7 @@ static size_t element_fill_dim
if (size_x == 1)
return 0;
if( size_x < GPU_TENSOR_MAX_WIDTH)
if ( size_x < GPU_TENSOR_MAX_WIDTH)
{
shape_x[rank_x] = size_x;
}
@ -85,7 +86,7 @@ static size_t element_fill_dim
int32_t remainder = 0;
compute_gpu_divisor( size_x, GPU_TENSOR_MAX_WIDTH, 1, &divisor );
remainder = size_x / divisor;
if( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank)
if ( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank)
{
// Cannot optimize.
shape_x[rank_x] = size_x;
@ -97,7 +98,7 @@ static size_t element_fill_dim
* so it should be always 2.
*/
cost_size = 2;
if( size_x > 1 )
if ( size_x > 1 )
{
shape_x[rank_x] = divisor;
shape_x[rank_x + 1] = remainder;
@ -170,25 +171,25 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize);
rank_out += element_fill_dim(out_shape_output, rank_out, GPU_TENSOR_MAX_WIDTH, outerSize);
if( 0 == rank_in )
if ( 0 == rank_in )
{
out_shape_x[0] = 1;
out_shape_x[1] = 1;
rank_in = 2;
}
else if( 1 == rank_in )
else if ( 1 == rank_in )
{
out_shape_x[1] = 1;
rank_in = 2;
}
if( 0 == rank_out )
if ( 0 == rank_out )
{
out_shape_output[0] = 1;
out_shape_output[1] = 1;
rank_out = 2;
}
else if( 1 == rank_out )
else if ( 1 == rank_out )
{
out_shape_output[1] = 1;
rank_out = 2;
@ -200,6 +201,75 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape
return ret;
} /* vsi_nn_kernel_optimize_reduce_shape() */
vsi_bool vsi_nn_kernel_optimize_tensor_shape
(
const int32_t* shape_x, const size_t rank_x,
const int32_t *axis, const size_t axis_size,
int32_t* out_shape_x, uint32_t* out_rank_x,
int32_t* out_axis, uint32_t* out_axis_size
)
{
vsi_bool ret = TRUE;
size_t i = 0;
size_t rank_in = 0;
size_t dims = 0;
int32_t innerSize = 1;
int32_t outerSize = 1;
int32_t axisSize = 1;
for (i = 0; i < axis_size; i++)
{
axisSize *= shape_x[axis[i]];
}
for (i = 0; i < (size_t)axis[0]; i++)
{
innerSize *= shape_x[i];
}
for (i = axis[axis_size - 1] + 1; i < rank_x; i++)
{
outerSize *= shape_x[i];
}
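/* The shape is viewed as [inner, axis-group, outer] around the contiguous axis block,
 * e.g. shape {4, 5, 6, 7} with axis {1, 2} gives innerSize = 4, axisSize = 30 and
 * outerSize = 7; element_fill_dim() below re-splits each part so that no single
 * dimension exceeds GPU_TENSOR_MAX_WIDTH. */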
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, innerSize);
dims = element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, axisSize);
if (dims == 0)
{
out_axis[0] = (int32_t)rank_in;
*out_axis_size = 1;
out_shape_x[rank_in ++] = 1;
}
else
{
*out_axis_size = (uint32_t)dims;
for (i = 0; i < dims; i++)
{
out_axis[i] = (int32_t)rank_in + (int32_t)i;
}
}
rank_in += dims;
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize);
if ( 0 == rank_in )
{
out_shape_x[0] = 1;
out_shape_x[1] = 1;
rank_in = 2;
}
else if ( 1 == rank_in )
{
out_shape_x[1] = 1;
rank_in = 2;
}
*out_rank_x = (uint32_t)rank_in;
return ret;
} /* vsi_nn_kernel_optimize_tensor_shape() */
vsi_bool vsi_nn_kernel_optimize_element_shape
(
const int32_t* shape_x, const size_t rank_x,
@ -218,13 +288,13 @@ vsi_bool vsi_nn_kernel_optimize_element_shape
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, element_num);
if( 0 == rank_in )
if ( 0 == rank_in )
{
out_shape_x[0] = 1;
out_shape_x[1] = 1;
rank_in = 2;
}
else if( 1 == rank_in )
else if ( 1 == rank_in )
{
out_shape_x[1] = 1;
rank_in = 2;
@ -275,13 +345,13 @@ vsi_bool vsi_nn_kernel_optimize_softmax_shape
rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize);
if( 0 == rank_in )
if ( 0 == rank_in )
{
out_shape_x[0] = 1;
out_shape_x[1] = 1;
rank_in = 2;
}
else if( 1 == rank_in )
else if ( 1 == rank_in )
{
out_shape_x[1] = 1;
rank_in = 2;
@ -313,7 +383,7 @@ static size_t tile_fill_dim
size_t cost_size = 1;
VSI_ASSERT( rank <= max_rank );
VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) );
if( size_output < GPU_TENSOR_MAX_WIDTH )
if ( size_output < GPU_TENSOR_MAX_WIDTH )
{
shape_x[rank] = size_x;
shape_y[rank] = size_y;
@ -325,7 +395,7 @@ static size_t tile_fill_dim
int32_t remainder = 0;
compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor );
remainder = size_output / divisor;
if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank )
if ( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank )
{
// Cannot optimize.
shape_x[rank] = size_x;
@ -339,7 +409,7 @@ static size_t tile_fill_dim
* so it should be always 2.
*/
cost_size = 2;
if( size_x > 1 )
if ( size_x > 1 )
{
shape_x[rank] = divisor;
shape_x[rank + 1] = remainder;
@ -349,7 +419,7 @@ static size_t tile_fill_dim
shape_x[rank] = 1;
shape_x[rank + 1] = 1;
}
if( size_y > 1 )
if ( size_y > 1 )
{
shape_y[rank] = divisor;
shape_y[rank + 1] = remainder;
@ -401,20 +471,20 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
sz = shape_output[i];
/*
* Skip dim if the size is equal to 1
* Also skip if( sx == 1 && sy == 1 )
* Also skip if ( sx == 1 && sy == 1 )
*/
if( shape_output[i] == 1 )
if ( shape_output[i] == 1 )
{
continue;
}
// Update state
state = TILE_STATE_EMPTY;
if( sx == sz )
if ( sx == sz )
{
state = TILE_STATE_NO_AXIS;
}
else if( sx != sz )
else if ( sx != sz )
{
state = TILE_STATE_AXIS_X;
}
@ -472,16 +542,16 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
break;
}
#undef _pack_state
if( append_dim )
if ( append_dim )
{
dims += tile_fill_dim( out_shape_x, out_shape_y, out_shape_output,
dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz );
}
}
if( ret )
if ( ret )
{
/* Append the last dim */
if( i == rank_output )
if ( i == rank_output )
{
sx = effective_size_x;
sy = effective_size_y;
@ -490,7 +560,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz );
}
/* Avoid 1D shape*/
if( 1 == dims )
if ( 1 == dims )
{
out_shape_x[1] = 1;
out_shape_y[1] = 1;
@ -508,3 +578,39 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
#undef _swap_size
return ret;
} /* vsi_nn_kernel_optimize_eltwise_shape() */
vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape
(
const int32_t* shape, const uint32_t rank,
int32_t* out_shape, uint32_t* out_rank
)
{
memcpy(out_shape, shape, sizeof(int32_t) * rank);
*out_rank = vsi_nn_max(rank, 2);
out_shape[1] = rank == 1 ? 1 : out_shape[1];
return TRUE;
}
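/* Collapse every dimension above the third into dimension 2, e.g. a {W, H, C, N}
 * shape becomes {W, H, C * N}; the resulting rank is at most 3. */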
vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape
(
const int32_t* shape, const uint32_t rank,
int32_t* out_shape, uint32_t* out_rank
)
{
uint32_t dim_num = 0;
uint32_t i = 0;
vsi_nn_kernel_optimize_1d_tensor_shape( shape,
rank, out_shape, &dim_num);
for (i = 3; i < dim_num; i++)
{
out_shape[2] *= out_shape[i];
}
*out_rank = vsi_nn_min(dim_num, 3);
return TRUE;
}

View File

@ -131,10 +131,8 @@ static vsi_nn_kernel_node_t _setup
float index[1024] = {0};
float value[1024] = {0};
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
{
return NULL;
}

View File

@ -255,7 +255,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
vx_node node = NULL;
vx_nn_convolution_params_ext2_t vxparam;
vx_tensor temp_tensors[3] = { NULL };
int i;
int32_t i;
vsi_bool need_explicit_padding = FALSE;
_build_vx_conv2d_param(
@ -277,8 +277,17 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
{
temp_tensors[1] = _expand_tensor_dim( inputs[1]->t,
(int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 );
int32_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 };
uint32_t new_w_rank = 4;
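/* View the 1-D weight as a 4-D conv2d weight of shape [1, kernel_size, channels, 1]
 * (width 1), so the depthwise conv1d can run through the regular 2-D convolution path. */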
new_w_shape[0] = 1;
new_w_shape[1] = inputs[1]->attr.size[0];
new_w_shape[2] = 1;
for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++)
{
new_w_shape[2] *= inputs[1]->attr.size[i];
}
new_w_shape[3] = 1;
temp_tensors[1] = vxReshapeTensor( inputs[1]->t, new_w_shape, new_w_rank );
CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final );
}
else

View File

@ -165,10 +165,8 @@ static vsi_nn_kernel_node_t _setup
float index[1024] = {0};
float value[1024] = {0};
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
{
return NULL;
}

View File

@ -135,10 +135,8 @@ static vsi_nn_kernel_node_t _setup
float index[1024] = {0};
float value[1024] = {0};
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32)
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
{
return NULL;
}

View File

@ -0,0 +1,143 @@
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_F32toF32(
__read_only image2d_array_t input,
__read_only image2d_t bias,
__read_only image2d_t scale,
__write_only image2d_array_t output,
float eps,
float input_zp,
float input_scale,
float output_zp,
float output_scale,
float e2InScale,
float scale_inOut,
float sumZpScale,
float zp2ScaleE2,
float sumZpScaleE2,
int width,
int height,
float dim_ratio
)
{
int lidx = get_local_id(0);
int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0);
float4 data, dst;
float2 sumSqr = (float2)(0);
float scale_vari, bias_val;
__local float2 local_sum[16];
for(; coord.x < width;)
{
data = read_imagef(input, coord);
coord.x += 16;
sumSqr.x += data.x;
sumSqr.y += data.x * data.x;
}
local_sum[lidx] = sumSqr;
barrier(CLK_LOCAL_MEM_FENCE);
if(lidx == 0)
{
for(int i = 1; i < 16; i++)
{
sumSqr += local_sum[i];
}
local_sum[0] = sumSqr;
}
barrier(CLK_LOCAL_MEM_FENCE);
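// local_sum[0] now holds (sum, sum of squares) for the whole row; dim_ratio
// (presumably 1 / width) turns them into the mean and E[x^2], and s1 below becomes
// the inverse standard deviation.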
sumSqr = local_sum[0] * dim_ratio;
sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps;
sumSqr.s1 = rsqrt(sumSqr.s1);
for(coord.x = lidx; coord.x < width;)
{
float4 gamma = read_imagef(scale, coord.xw);
float4 beta = read_imagef(bias, coord.xw);
data = read_imagef(input, coord);
scale_vari = gamma.s0 * sumSqr.s1;
bias_val = (beta.s0 - scale_vari * sumSqr.s0);
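// Equivalent to gamma * (x - mean) * rsqrt(var + eps) + beta, with the mean folded
// into bias_val.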
dst.x = data.x * scale_vari + bias_val;
write_imagef(output, coord, dst);
coord.x += 16;
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_U8toU8(
__read_only image2d_array_t input,
__read_only image2d_t bias,
__read_only image2d_t scale,
__write_only image2d_array_t output,
float eps,
float input_zp,
float input_scale,
float output_zp,
float output_scale,
float e2InScale,
float scale_inOut,
float sumZpScale,
float zp2ScaleE2,
float sumZpScaleE2,
int width,
int height,
float dim_ratio
)
{
int lidx = get_local_id(0);
int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0);
uint4 data, dst;
float2 sumSqr;
uint tmpSum = 0, tmpSqr = 0;
float scale_vari, bias_val;
__local uint local_sum[1];
__local uint local_sqr[1];
if(lidx == 0)
{
local_sum[0] = 0;
local_sqr[0] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
for(; coord.x < width;)
{
data = read_imageui(input, coord);
coord.x+=16;
tmpSum += data.x;
tmpSqr += data.x * data.x;
}
atom_add(local_sum, tmpSum);
atom_add(local_sqr, tmpSqr);
barrier(CLK_LOCAL_MEM_FENCE);
tmpSum = local_sum[0];
tmpSqr = local_sqr[0];
//sumSqr.x = ((float)tmpSum - width * input_zp) * input_scale;
//sumSqr.y = ((float)tmpSqr - 2 * input_zp * (float)tmpSum + width * input_zp * input_zp) * e2InScale;
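// Rearranging the commented formulas above, the host is expected to pass
// sumZpScale = width * input_zp * input_scale, e2InScale = input_scale^2,
// zp2ScaleE2 = 2 * input_zp * e2InScale and sumZpScaleE2 = width * input_zp^2 * e2InScale.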
sumSqr.x = (float)tmpSum * input_scale - sumZpScale;
sumSqr.y = (float)tmpSqr * e2InScale - zp2ScaleE2 * (float)tmpSum + sumZpScaleE2;
sumSqr *= dim_ratio;
sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps;
sumSqr.s1 = rsqrt(sumSqr.s1);
for(coord.x = lidx; coord.x < width;)
{
float4 gamma = read_imagef(scale, coord.xw);
float4 beta = read_imagef(bias, coord.xw);
data = read_imageui(input, coord);
scale_vari = gamma.s0 * sumSqr.s1;
float alpha = scale_inOut * scale_vari;
bias_val = (beta.s0 - scale_vari * sumSqr.s0) * output_scale + output_zp;
float tmpVal = data.x - input_zp;
float4 norm;
norm.x = tmpVal * alpha + bias_val;
dst = convert_uint4_rte(norm);
write_imageui(output, coord, dst);
coord.x+=16;
}
}

View File

@ -6,32 +6,30 @@ __kernel void gemm_F32F32toF32_2D(
int K,
int N,
int ac2zero,
int bc2zero
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int2 coord_a = (int2)(0, gidy);
int2 coord_b = (int2)(gidx, 0);
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
float4 sum = (float4)(0);
for(; coord_a.x < K;)
for(; coord.z < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord_a);
tempB0 = read_imagef(inputB, coord_b);
coord_a.x++;
coord_b.y++;
tempA0 = read_imagef(inputA, coord.zy);
tempB0 = read_imagef(inputB, coord.xz);
coord.z++;
sum += tempA0 * tempB0;
sum = sum + tempA0 * tempB0;
}
coord_b.y = gidy;
write_imagef(output, coord_b, sum);
write_imagef(output, coord.xy, sum);
}
__kernel void gemm_F32F32toF32_3D(
@ -42,7 +40,13 @@ __kernel void gemm_F32F32toF32_3D(
int K,
int N,
int ac2zero,
int bc2zero
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);
@ -60,10 +64,160 @@ __kernel void gemm_F32F32toF32_3D(
coord_a.x++;
coord_b.y++;
sum += tempA0 * tempB0;
sum = sum + tempA0 * tempB0;
}
coord_b.y = get_global_id(1);
coord_b.z = get_global_id(2);
write_imagef(output, coord_b, sum);
}
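// The gemm_transb_* kernels below compute A x B^T: B is read with swapped coordinates
// (row n, column k), and the I8 variants dequantize B as (b - zp_b) * scale_b before
// accumulating.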
__kernel void gemm_transb_F32F32toF32_2D(
__read_only image2d_t inputA,
__read_only image2d_t inputB,
__write_only image2d_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
float4 sum = (float4)(0);
for(; coord.z < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord.zy);
tempB0 = read_imagef(inputB, coord.zx);
coord.z++;
sum = sum + tempA0 * tempB0;
}
write_imagef(output, coord.xy, sum);
}
__kernel void gemm_transb_F32F32toF32_3D(
__read_only image2d_array_t inputA,
__read_only image2d_array_t inputB,
__write_only image2d_array_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 0 : get_global_id(2)), 0);
float4 sum = (float4)(0);
for(; coord_a.x < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord_a);
tempB0 = read_imagef(inputB, coord_b);
coord_a.x++;
coord_b.x++;
sum = sum + tempA0 * tempB0;
}
coord_a.x = get_global_id(0);
coord_a.z = get_global_id(2);
write_imagef(output, coord_a, sum);
}
__kernel void gemm_transb_F32I8toF32_2D(
__read_only image2d_t inputA,
__read_only image2d_t inputB,
__write_only image2d_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
float4 sum = (float4)(0);
for(; coord.z < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord.zy);
tempB0 = convert_float4(read_imagei(inputB, coord.zx));
coord.z++;
tempB0.x = (tempB0.x - zp_b) * scale_b;
sum = sum + tempA0 * tempB0;
}
write_imagef(output, coord.xy, sum);
}
__kernel void gemm_transb_F32I8toF32_3D(
__read_only image2d_array_t inputA,
__read_only image2d_array_t inputB,
__write_only image2d_array_t output,
int M,
int K,
int N,
int ac2zero,
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 0 : get_global_id(2)), 0);
float4 sum = (float4)(0);
for(; coord_a.x < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord_a);
tempB0 = convert_float4(read_imagei(inputB, coord_b));
tempB0.x = (tempB0.x - zp_b) * scale_b;
coord_a.x++;
coord_b.x++;
sum = sum + tempA0 * tempB0;
}
coord_a.x = get_global_id(0);
coord_a.z = get_global_id(2);
write_imagef(output, coord_a, sum);
}

View File

@ -6,32 +6,30 @@ __kernel void gemm_transa_F32F32toF32_2D(
int K,
int N,
int ac2zero,
int bc2zero
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int2 coord_a = (int2)(gidy, 0);
int2 coord_b = (int2)(gidx, 0);
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
float4 sum = (float4)(0);
for(; coord_a.y < K;)
for(; coord.z < K;)
{
float4 tempA0;
float4 tempB0;
tempA0 = read_imagef(inputA, coord_a);
tempB0 = read_imagef(inputB, coord_b);
coord_a.y++;
coord_b.y++;
tempA0 = read_imagef(inputA, coord.yz);
tempB0 = read_imagef(inputB, coord.xz);
coord.z++;
sum += tempA0 * tempB0;
sum = sum + tempA0 * tempB0;
}
coord_b.y = gidy;
write_imagef(output, coord_b, sum);
write_imagef(output, coord.xy, sum);
}
__kernel void gemm_transa_F32F32toF32_3D(
@ -42,7 +40,13 @@ __kernel void gemm_transa_F32F32toF32_3D(
int K,
int N,
int ac2zero,
int bc2zero
int bc2zero,
float scale_a,
float zp_a,
float scale_b,
float zp_b,
float scale_out,
float zp_out
)
{
int gidx = get_global_id(0);
@ -63,7 +67,7 @@ __kernel void gemm_transa_F32F32toF32_3D(
coord_a.y++;
coord_b.y++;
sum += tempA0 * tempB0;
sum = sum + tempA0 * tempB0;
}
coord_b.y = gidy;

View File

@ -0,0 +1,108 @@
inline float roi_align_1x1
(
__read_only image2d_array_t input,
float2 region_start,
float2 region_end,
float2 bin_size,
int2 grid_size,
float2 rcp_of_grid_size,
int pz
)
{
float sum = 0;
for(int iy = 0; iy < grid_size.y; ++iy)
{
for(int ix = 0; ix < grid_size.x; ++ix)
{
float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);
float2 pos = region_start + ixy * bin_size * rcp_of_grid_size;
int2 xy_low = convert_int2(pos);
int2 xy_high = xy_low + 1;
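// Bilinear interpolation: lx/ly are the fractional offsets inside the cell and
// w1..w4 the weights of the four neighbouring pixels.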
float ly = pos.y - xy_low.y;
float lx = pos.x - xy_low.x;
float hy = 1.0f - ly;
float hx = 1.0f - lx;
float w1 = hy * hx;
float w2 = hy * lx;
float w3 = ly * hx;
float w4 = ly * lx;
float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;
float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;
float data3 = read_imagef(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;
float data4 = read_imagef(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;
sum = sum + w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);
}
#define EPS_GRID 0.00001f
__kernel void roi_align_F32toF32
(
__read_only image2d_array_t input,
__read_only image2d_t rois,
__read_only image2d_t n_rois,
__write_only image2d_array_t output,
float spatial_x_scale,
float spatial_y_scale,
float in_width,
float in_height,
float rcp_of_out_width,
float rcp_of_out_height,
float sampling_x_ratio,
float sampling_y_ratio,
int depth
)
{
int px = get_global_id(0);
int py = get_global_id(1);
int pw = get_global_id(2);
int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x;
float4 roi_x = read_imagef(rois, (int2)(0, pw));
float4 roi_y = read_imagef(rois, (int2)(1, pw));
float4 roi_z = read_imagef(rois, (int2)(2, pw));
float4 roi_w = read_imagef(rois, (int2)(3, pw));
float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, roi_w.x);
float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale);
float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f);
float2 spatial_indx = (float2)(px, py);
float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);
float2 max_spatial_dims = (float2)(in_width, in_height);
float2 bin_size = roi_dims * pooled_dims;
float2 region_start = spatial_indx * bin_size + roi_anchor.xy;
float2 region_end = region_start + bin_size;
float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio);
roi_bin_grid = roi_bin_grid == 0 ? ceil(bin_size - EPS_GRID) : roi_bin_grid;
int kz = roi_batch * depth;
float2 rcp_of_grid_size = 1.0f / roi_bin_grid;
int2 grid_size_xy = convert_int2(roi_bin_grid);
float4 interp;
int kz1 = pw * depth;
for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++)
{
interp.x = roi_align_1x1( input,
region_start,
region_end,
bin_size,
grid_size_xy,
rcp_of_grid_size,
kz);
write_imagef(output, (int4)(px, py, kz1, 0), interp);
}
}

View File

@ -0,0 +1,90 @@
__kernel void space2depth_internal_F32toF32 (
image2d_array_t input,
image2d_array_t output,
int block_size_x, int block_size_y,
float scaleInOut, float zpInOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int inDepth = get_image_array_size(input);
int4 coord = (int4)(x, y, z, 0);
float4 data = {0.0};
data = read_imagef(input, coord);
ushort blockSize_x = convert_ushort(block_size_x);
ushort blockSize_y = convert_ushort(block_size_y);
int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0);
coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth
+ z;
write_imagef(output, coord_out, data);
}
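// Specialization for block_size_x == 2, block_size_y == 1: the division and modulo
// reduce to a right shift and a bit mask.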
__kernel void space2depth_internal_F32toF32_X2Y1 (
image2d_array_t input,
image2d_array_t output,
int block_size_x, int block_size_y,
float scaleInOut, float zpInOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int inDepth = get_image_array_size(input);
int4 coord = (int4)(x, y, z, 0);
float4 data = {0.0};
data = read_imagef(input, coord);
int4 coord_out = (int4)(x >> 1, y, 0, 0);
coord_out.z = (x & 1) * inDepth + z;
write_imagef(output, coord_out, data);
}
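// The U8 variants repeat the same index mapping and requantize each value with
// scaleInOut / zpInOut before writing.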
__kernel void space2depth_internal_U8toU8 (
image2d_array_t input,
image2d_array_t output,
int block_size_x, int block_size_y,
float scaleInOut, float zpInOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int inDepth = get_image_array_size(input);
int4 coord = (int4)(x, y, z, 0);
uint4 data = {0};
data = read_imageui(input, coord);
ushort blockSize_x = convert_ushort(block_size_x);
ushort blockSize_y = convert_ushort(block_size_y);
int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0);
coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth
+ z;
data.x = convert_uint(data.x * scaleInOut + zpInOut);
write_imageui(output, coord_out, data);
}
__kernel void space2depth_internal_U8toU8_X2Y1 (
image2d_array_t input,
image2d_array_t output,
int block_size_x, int block_size_y,
float scaleInOut, float zpInOut)
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
int inDepth = get_image_array_size(input);
int4 coord = (int4)(x, y, z, 0);
uint4 data = {0};
data = read_imageui(input, coord);
int4 coord_out = (int4)(x >> 1, y, 0, 0);
coord_out.z = (x & 1) * inDepth + z;
data.x = convert_uint(data.x * scaleInOut + zpInOut);
write_imageui(output, coord_out, data);
}

View File

@ -1,253 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
void myTensorCropFunc
(
int8_t *src,
int8_t *dst
)
{
return;
}
vsi_status VX_CALLBACK TensorCropInternalKernel
(vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if(paramNum == 2)
{
}
return status;
}
vsi_status VX_CALLBACK TensorCropInitializer
(vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
vsi_status status = VX_SUCCESS;
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
vx_kernel_execution_parameters_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
vx_tensor input = (vx_tensor)paramObj[0];
vx_tensor output = (vx_tensor)paramObj[1];
uint32_t output_size[4] = {1, 1, 1, 1};
vsi_enum dataFormat, dstFormat;
int8_t input_fixPointPos = 0;
vx_uint32 i = 0;
int32_t offset[3];
size_t size[DIM_SIZE];
vsi_nn_tensor_attr_t attr[2];
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr[0]);
status |= vsi_nn_vxGetTensorAttr(output, &attr[1]);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
dataFormat = attr[0].dtype.vx_type;
input_fixPointPos = attr[0].dtype.fl;
dstFormat = attr[1].dtype.vx_type;
for (i = 0; i < attr[1].dim_num; i++)
{
output_size[i] = attr[1].size[i];
}
vxCopyScalar((vx_scalar)paramObj[2], &offset[0], VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[3], &offset[1], VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[4], &offset[2], VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
memset(size, 0, sizeof(size_t) * DIM_SIZE);
switch(dstFormat)
{
case VSI_NN_TYPE_INT8:
case VSI_NN_TYPE_UINT8:
size[0] = 16;
size[1] = 4;
break;
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_UINT16:
case VSI_NN_TYPE_FLOAT16:
size[0] = 8;
size[1] = 4;
break;
}
shaderParam.globalWorkOffset[0] = offset[0];
shaderParam.globalWorkOffset[1] = offset[1];
shaderParam.globalWorkOffset[2] = offset[2];
shaderParam.globalWorkScale[0] = size[0];
shaderParam.globalWorkScale[1] = size[1];
shaderParam.globalWorkScale[2] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
shaderParam.globalWorkSize[2] = output_size[2];
if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16)
{
vx_uint32 uniConvertInt16toFp16_2x8[16] = {
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
};
#define cropMIN(x, y) (((x) <= (y)) ? (x) : (y))
#define CROP_MAX_POST_SHIFT_BITS (31)
#define CROP_MAX_MULTIPLIER_NUM (65535)
if (input_fixPointPos > 0)
{
vx_uint8 postshift = cropMIN(input_fixPointPos, CROP_MAX_POST_SHIFT_BITS);
uniConvertInt16toFp16_2x8[7] |= (postshift & 0x1F);
}
else
{
vx_uint32 multiplier = cropMIN((int64_t)1 << (-input_fixPointPos), CROP_MAX_MULTIPLIER_NUM);
for (i = 0; i < 8; i++)
{
uniConvertInt16toFp16_2x8[i + 8] = multiplier;
}
}
#undef cropMIN
#undef CROP_MAX_POST_SHIFT_BITS
#undef CROP_MAX_MULTIPLIER_NUM
status |= vxSetNodeUniform(nodObj, "uniConvertInt16toFp16_2x8", 1, uniConvertInt16toFp16_2x8);
}
vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
return status;
}
vx_param_description_t basekernel_tensorCrop_params[] = {
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t vxTensorCropKernelInt16Info =
{
VX_KERNEL_ENUM_TENSORCROP_INT16,
VX_KERNEL_NAME_TENSORCROP_INT16,
NULL,
basekernel_tensorCrop_params,
(sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
TensorCropInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxTensorCropKernelInt8Info =
{
VX_KERNEL_ENUM_TENSORCROP_INT8,
VX_KERNEL_NAME_TENSORCROP_INT8,
NULL,
basekernel_tensorCrop_params,
(sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
TensorCropInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxTensorCropKernelInt16Fp16Info =
{
VX_KERNEL_ENUM_TENSORCROP_INT16_FP16,
VX_KERNEL_NAME_TENSORCROP_INT16_FP16,
NULL,
basekernel_tensorCrop_params,
(sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
TensorCropInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_CROP_list[] =
{
NULL,
&vxTensorCropKernelInt16Info,
&vxTensorCropKernelInt8Info,
&vxTensorCropKernelInt16Fp16Info,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,323 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_FCL2)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_FULLYCONNECTED_AXIS2)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_fullconnect2")
#define _VX_KERNEL_FUNC_KERNEL (vxFullconnect2Kernel)
//static uint32_t layerNum = 0;
static vsi_status VX_CALLBACK vxFullconnect2Kernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
/* TODO: */
#define ARG_NUM (2)
#define TENSOR_NUM_INPUT (3)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vsi_status status = VX_SUCCESS;
uint32_t i, j, k;
vx_context context = NULL;
vsi_nn_tensor_attr_t attr[TENSOR_NUM];
uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM];
vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL};
uint8_t *buffer_ptr[TENSOR_NUM] = {NULL};
vx_tensor tensor[TENSOR_NUM];
//char fileName[256] = {'\0'};
//uint32_t total_size;
int32_t axis, weights;
uint32_t num_fc = 1, num_no_fc = 1;
//prepare data
context = vxGetContext((vx_reference)node);
for( i = 0; i < TENSOR_NUM_INPUT; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY);
}
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY);
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(axis),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(weights),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
//op calc
for(i = 0; i <= (uint32_t)axis; ++i)
{
num_fc *= attr[0].size[i];
}
for(i = axis + 1; i < attr[0].dim_num; ++i)
{
num_no_fc *= attr[0].size[i];
}
for(k = 0; k < num_no_fc; ++k)
{
for(j = 0; j < (uint32_t)weights; ++j)
{
float sum;
vsi_nn_DtypeToFloat32(&buffer_ptr[2][stride_size[2][0] * j], &sum, &attr[2].dtype);
for(i = 0; i < num_fc; ++i)
{
float x, w;
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * (i + num_fc * k)],
&x, &attr[0].dtype);
vsi_nn_DtypeToFloat32(&buffer_ptr[1][stride_size[1][0] * (i + num_fc * j)],
&w, &attr[1].dtype);
sum += w * x;
}
vsi_nn_Float32ToDtype(sum, &buffer_ptr[3][stride_size[3][0] * (j + weights * k)],
&attr[3].dtype);
}
}
#if 0
print_index = 3;
total_size = vsi_nn_ShapeProduct(size[print_index], dim_num[print_index]);
if (dim_num[print_index] == 3)
{
snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum,
size[print_index][0], size[print_index][1], size[print_index][2]);
}
else
{
snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum,
size[print_index][0], size[print_index][1], size[print_index][2], size[print_index][3]);
}
vsi_nn_SaveDataToText(fileName, buffer_ptr[print_index], total_size,
data_format[print_index], NULL);
layerNum++;
#endif
//save data
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY);
if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i]));
}
for( i = 0; i < TENSOR_NUM; i ++ )
{
if (buffer_ptr[i]) free(buffer_ptr[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
};
void myFullyConnected_Axis2Func
(
int8_t *src,
int8_t *dst
)
{
return;
}
vsi_status VX_CALLBACK vxFullyConnected_Axis2Kernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if(paramNum == 2)
{
}
return status;
}
vsi_status VX_CALLBACK vxFullyConnected_Axis2Initializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
vsi_status status = VX_SUCCESS;
// Alignment with a power of two value.
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
vx_kernel_execution_parameters_t shaderParam = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1};
uint32_t output_size[DIM_SIZE] = {1, 1, 1, 1};
uint32_t uniMulAcc_16x1[16] = {
0x00005555, // TCfg
0x00000000, // ASelt
0x76543210, 0x00000000, // ABin
0x00005555, // BSelt
0x76543210, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
uint32_t loopNum = 0;
vsi_nn_tensor_attr_t attr[2];
uint32_t i;
uint32_t input_dims = 0;
uint32_t output_dims = 0;
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[1], &attr[0]);
status |= vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[3], &attr[1]);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
input_dims = attr[0].dim_num;
for (i = 0; i < input_dims; i++)
{
input_size[i] = attr[0].size[i];
}
output_dims = attr[1].dim_num;
for (i = 0; i < output_dims; i++)
{
output_size[i] = attr[1].size[i];
}
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkScale[0] = 1;
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
vxSetNodeUniform(nodObj, "uniMulAcc_16x1", 1, uniMulAcc_16x1);
loopNum = gcmALIGN(input_size[0], 32);
vxSetNodeUniform(nodObj, "loopNum", 1, &loopNum);
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
return status;
}
static vx_param_description_t vxFullyConnected_Axis2KernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxFullyConnected_Axis2KernelInfo =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
vxFullyConnected_Axis2Kernel,
vxFullyConnected_Axis2KernelParam,
(sizeof(vxFullyConnected_Axis2KernelParam) / sizeof(vxFullyConnected_Axis2KernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxFullyConnected_Axis2Initializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_FCL2_list[] =
{
&_VX_KERNEL_VAR,
&vxFullyConnected_Axis2KernelInfo,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,688 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
void myLayerNormFunc
(
void* src,
int16_t* scale,
float* bias,
float eps,
void* dst,
uint32_t input_dim,
uint32_t width,
uint32_t height,
uint32_t channel,
uint32_t batch
)
{
uint32_t ch = (input_dim <= 2) ? 1 : channel;
uint32_t bn = (input_dim <= 3) ? 1 : batch;
uint32_t b = 0, c = 0, h = 0, w = 0;
int16_t* imgIn, *imgOut;
imgIn = (int16_t*)src;
imgOut = (int16_t*)dst;
VSILOGI("Hello myLayerNormFunc!\n");
for (b = 0; b < bn; b++)
{
for (c = 0; c < ch; c++)
{
for (h = 0; h < height; h++)
{
uint32_t len = (h + (c + b*ch)*height) * width;
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
sum += vsi_nn_Fp16toFp32(imgIn[index]);
}
mean = sum / width;
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean;
sumsq += data * data;
}
vari = sumsq / width;
vari = (float)(1.0 / sqrtf(vari + eps));
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean;
float scaleVal = vsi_nn_Fp16toFp32(scale[w]);
float biasVal = bias[w];
float normVal = data * vari * scaleVal + biasVal;
imgOut[index] = vsi_nn_Fp32ToFp16(normVal);
}
}
}
}
return;
}
void myLayerNormFunc_u8
(
void* src,
int16_t* scale,
float* bias,
float eps,
void* dst,
uint32_t input_dim,
uint32_t width,
uint32_t height,
uint32_t channel,
uint32_t batch,
int32_t inZp,
int32_t outZp,
float inScale,
float outScale
)
{
uint32_t ch = (input_dim <= 2) ? 1 : channel;
uint32_t bn = (input_dim <= 3) ? 1 : batch;
uint32_t b = 0, c = 0, h = 0, w = 0;
uint8_t* imgIn, *imgOut;
imgIn = (uint8_t*)src;
imgOut = (uint8_t*)dst;
VSILOGI("Hello myLayerNormFunc!\n");
for (b = 0; b < bn; b++)
{
for (c = 0; c < ch; c++)
{
for (h = 0; h < height; h++)
{
uint32_t len = (h + (c + b*ch)*height) * width;
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
//sum += vsi_nn_Fp16toFp32(imgIn[index]);
sum += vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8);
}
mean = sum / width;
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
//float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean;
float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean;
sumsq += data * data;
}
vari = sumsq / width;
vari = (float)(1.0 / sqrtf(vari + eps));
for (w = 0; w < width; w++)
{
uint32_t index = len + w;
//float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean;
float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean;
float scaleVal = vsi_nn_Fp16toFp32(scale[w]);
float biasVal = bias[w];
float normVal = data * vari * scaleVal + biasVal;
//imgOut[index] = vsi_nn_Fp32ToFp16(normVal);
imgOut[index] = (vx_uint8)vsi_nn_Fp32ToAffine(normVal, outScale, outZp, VSI_NN_TYPE_UINT8);
}
}
}
}
return;
}
vsi_status VX_CALLBACK vxLayerNormKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if(paramNum == 5)
{
vx_context context = NULL;
// tensor
vx_tensor imgObj[4] = { NULL };
vsi_nn_tensor_attr_t attr[4];
int16_t *input = NULL, *output = NULL, *scale = NULL;
float *bias = NULL;
uint32_t input_size[4] = {1, 1, 1, 1}, output_size[4] = {1, 1, 1, 1};
uint32_t scale_size[4] = {1, 1, 1, 1}, bias_size[4] = {1, 1, 1, 1};
uint32_t input_stride_size[4] = {0};
uint32_t output_stride_size[4] = {0};
uint32_t scale_stride_size[4] = {0};
uint32_t bias_stride_size[4] = {0};
vx_tensor_addressing input_user_addr = NULL;
vx_tensor_addressing output_user_addr = NULL;
vx_tensor_addressing scale_user_addr = NULL;
vx_tensor_addressing bias_user_addr = NULL;
vsi_nn_type_e inputFormat = VSI_NN_TYPE_FLOAT16, outputFormat = VSI_NN_TYPE_FLOAT16;
vsi_nn_type_e scaleFormat = VSI_NN_TYPE_FLOAT16, biasFormat = VSI_NN_TYPE_FLOAT16;
uint32_t input_dims = 0, output_dims = 0;
uint32_t scale_dims = 0, bias_dims = 0;
uint32_t i;
int32_t in_zp, out_zp;
float in_scale, out_scale;
// scalar
vx_scalar scalar[1] = { NULL };
float eps = .0f;
imgObj[0] = (vx_tensor)paramObj[0];
imgObj[1] = (vx_tensor)paramObj[1];
imgObj[2] = (vx_tensor)paramObj[2];
imgObj[3] = (vx_tensor)paramObj[3];
scalar[0] = (vx_scalar)paramObj[4];
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[3], 0, sizeof(vsi_nn_tensor_attr_t));
context = vxGetContext((vx_reference)node);
if (context == NULL)
{
VSILOGE("vxGetContext failure! at line %d\n", __LINE__);
goto OnError;
}
status = vsi_nn_vxGetTensorAttr(imgObj[0], &attr[0]);
status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]);
status |= vsi_nn_vxGetTensorAttr(imgObj[2], &attr[2]);
status |= vsi_nn_vxGetTensorAttr(imgObj[3], &attr[3]);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
goto OnError;
}
input_dims = attr[0].dim_num;
inputFormat = attr[0].dtype.vx_type;
for (i = 0; i < input_dims; i++)
{
input_size[i] = attr[0].size[i];
}
in_zp = attr[0].dtype.zero_point;
in_scale = attr[0].dtype.scale;
//bias
bias_dims = attr[1].dim_num;
biasFormat = attr[1].dtype.vx_type;
for (i = 0; i < bias_dims; i++)
{
bias_size[i] = attr[1].size[i];
}
//scale
scale_dims = attr[2].dim_num;
scaleFormat = attr[2].dtype.vx_type;
for (i = 0; i < scale_dims; i++)
{
scale_size[i] = attr[2].size[i];
}
//output
output_dims = attr[3].dim_num;
outputFormat = attr[3].dtype.vx_type;
for (i = 0; i < output_dims; i++)
{
output_size[i] = attr[3].size[i];
}
out_zp = attr[3].dtype.zero_point;
out_scale = attr[3].dtype.scale;
input_size[2] = (input_dims <= 2)?1:input_size[2];
input_size[3] = (input_dims <= 3)?1:input_size[3];
input_stride_size[0] = vsi_nn_GetTypeBytes(inputFormat);
output_stride_size[0] = vsi_nn_GetTypeBytes(outputFormat);
for (i=1; i< input_dims; i++)
{
input_stride_size[i] = input_stride_size[i-1] * input_size[i-1];
output_stride_size[i] = output_stride_size[i-1] * output_size[i-1];
}
input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t));
output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t));
input_user_addr = vxCreateTensorAddressing(context, input_size, input_stride_size, (vx_uint8)input_dims);
vsi_nn_copy_tensor_patch(imgObj[0], &attr[0], input, VX_READ_ONLY);
//scale and bias
scale_stride_size[0] = vsi_nn_GetTypeBytes(scaleFormat);
bias_stride_size[0] = vsi_nn_GetTypeBytes(biasFormat);
for (i=1; i< scale_dims; i++)
{
scale_stride_size[i] = scale_stride_size[i-1] * scale_size[i-1];
bias_stride_size[i] = bias_stride_size[i-1] * bias_size[i-1];
}
scale = (int16_t*)malloc(scale_size[0]*sizeof(int16_t));
bias = (float*)malloc(bias_size[0]*sizeof(float));
bias_user_addr = vxCreateTensorAddressing(context, bias_size, bias_stride_size, (vx_uint8)bias_dims);
vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], bias, VX_READ_ONLY);
scale_user_addr = vxCreateTensorAddressing(context, scale_size, scale_stride_size, (vx_uint8)scale_dims);
vsi_nn_copy_tensor_patch(imgObj[2], &attr[2], scale, VX_READ_ONLY);
// scalar
status = vxCopyScalar(scalar[0], &eps, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
if (status != VX_SUCCESS)
{
VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__);
goto OnError;
}
// Call C Prototype
if(inputFormat == VSI_NN_TYPE_FLOAT16)
{
myLayerNormFunc(input, scale, bias, eps, output, input_dims, input_size[0],
input_size[1], input_size[2], input_size[3]);
}
else
{
myLayerNormFunc_u8(input, scale, bias, eps, output, input_dims, input_size[0],
input_size[1], input_size[2], input_size[3], in_zp, out_zp, in_scale, out_scale);
}
//output tensor
output_user_addr = vxCreateTensorAddressing(context, output_size,
output_stride_size, (vx_uint8)output_dims);
vsi_nn_copy_tensor_patch(imgObj[3], &attr[3], output, VX_WRITE_ONLY);
OnError:
if(input) free(input);
if(scale) free(scale);
if(bias) free(bias);
if(output) free(output);
if(input_user_addr) vxReleaseTensorAddressing(&input_user_addr);
if(scale_user_addr) vxReleaseTensorAddressing(&scale_user_addr);
if(bias_user_addr) vxReleaseTensorAddressing(&bias_user_addr);
if(output_user_addr) vxReleaseTensorAddressing(&output_user_addr);
}
return status;
}
vsi_status VX_CALLBACK vxLayerNormInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
vsi_status status = VX_SUCCESS;
// Alignment with a power of two value.
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
vx_kernel_execution_parameters_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
vx_tensor input = (vx_tensor)paramObj[0];
vx_tensor scale = (vx_tensor)paramObj[2];
vx_tensor output = (vx_tensor)paramObj[3];
uint32_t input_size[4] = {1, 1, 1, 1};
uint32_t input_dims = 0;
vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16;
vsi_nn_type_e scaleDataFormat = VSI_NN_TYPE_FLOAT16;
vsi_nn_type_e outputDataFormat = VSI_NN_TYPE_FLOAT16;
vx_float32 scaleIn = 0;
vx_float32 scaleOut = 0;
vx_float32 reScaleOut_u8 = 0;
vx_float32 reOutZP = 0.f;
int32_t output_ZP = 0;
int32_t input_ZP = 0;
vx_uint32 iter = 0;
int32_t sumInZp = 0;
int32_t tmpZp1 = 0;
int32_t tmpZp2 = 0;
vx_float32 e2InScale = 0;
vsi_nn_tensor_attr_t attr[3];
uint32_t i;
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr[0]);
status |= vsi_nn_vxGetTensorAttr(output, &attr[1]);
status |= vsi_nn_vxGetTensorAttr(scale, &attr[2]);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
input_dims = attr[0].dim_num;
inputDataFormat = attr[0].dtype.vx_type;
for (i = 0; i < input_dims; i++)
{
input_size[i] = attr[0].size[i];
}
input_ZP = attr[0].dtype.zero_point;
scaleIn = attr[0].dtype.scale;
outputDataFormat = attr[1].dtype.vx_type;
output_ZP = attr[1].dtype.zero_point;
scaleOut = attr[1].dtype.scale;
scaleDataFormat = attr[2].dtype.vx_type;
if(outputDataFormat == VSI_NN_TYPE_UINT8)
{
reScaleOut_u8 = 1.0f / scaleOut;
reOutZP = (vx_float32)output_ZP;
}
iter = ((input_size[0] + 15) / 16) * 16;
sumInZp = input_ZP * iter * (-1);
tmpZp1 = (-2) * input_ZP;
tmpZp2 = iter * input_ZP * input_ZP;
e2InScale = scaleIn * scaleIn;
input_size[2] = (input_dims <= 2)?1:input_size[2];
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkOffset[2] = 0;
shaderParam.globalWorkScale[0] = input_size[0];
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkScale[2] = 1;
shaderParam.globalWorkSize[0] = 1;
shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1], 4);
shaderParam.globalWorkSize[2] = input_size[2];
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
{
vx_float32 dimRatio = 1.0f / (vx_float32)input_size[0];
vx_uint32 uniFp16SumSqr_dp8x2[16] = {
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0x76543210, // ABin
0x5555aaaa, // BSelt
0x00000000, 0x76543210, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vx_uint32 UniFP16toFP32Lo4_dp4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
};
vx_uint32 uniExtractHalf4_dp4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
};
vx_uint32 uniConvertSecFp16Fp32_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
};
vx_uint32 uniSumU8_16x1[16] = {
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0xfedcba98, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
};
vx_uint32 uniSqrSum_16x1[16] = {
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0xfedcba98, // ABin
0x55555555, // BSelt
0x76543210, 0xfedcba98, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vx_uint32 uniConvert1stUint8SubZpToFp32_4x4[16] = {
0x05050505, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
};
vx_uint32 uniConvert2ndUint8SubZpToFp32_4x4[16] = {
0x05050505, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
};
vx_uint32 uniConvert3rdUint8SubZpToFp32_4x4[16] = {
0x05050505, // TCfg
0x04040404, // ASelt
0x00090008, 0x000b000a, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
};
vx_uint32 uniConvert4thUint8SubZpToFp32_4x4[16] = {
0x05050505, // TCfg
0x04040404, // ASelt
0x000d000c, 0x000f000e, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
};
vx_uint32 uniConvertInt32toUint8_2x8[16] = {
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vx_uint32 UniPackFP16even_2x8[16] = {
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
};
if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
status = vxSetNodeUniform(nodObj, "width", 1, &input_size[0]);
status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio);
status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4);
status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1);
status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1);
status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP);
status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn);
status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp);
status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1);
status |= vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2);
status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale);
status |= vxSetNodeUniform(nodObj, "UniPackFP16even_2x8", 1, UniPackFP16even_2x8);
}
else
{
status = vxSetNodeUniform(nodObj, "uniFp16SumSqr_dp8x2", 1, uniFp16SumSqr_dp8x2);
status |= vxSetNodeUniform(nodObj, "width", 1, &input_size[0]);
status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio);
status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4);
status |= vxSetNodeUniform(nodObj, "uniExtractHalf4_dp4x4", 1, uniExtractHalf4_dp4x4);
status |= vxSetNodeUniform(nodObj, "uniConvertInt32toUint8_2x8", 1, uniConvertInt32toUint8_2x8);
status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1);
status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1);
status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4);
status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP);
status |= vxSetNodeUniform(nodObj, "output_ZP", 1, &output_ZP);
status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn);
status |= vxSetNodeUniform(nodObj, "outputScale", 1, &reScaleOut_u8);
status |= vxSetNodeUniform(nodObj, "outputZP", 1, &reOutZP);
status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp);
status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1);
status |= vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2);
status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale);
}
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
}
return status;
}
static vx_param_description_t vxLayerNormKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t vxLayerNormKernelInfo =
{
VX_KERNEL_ENUM_LAYERNORM,
VX_KERNEL_NAME_LAYERNORM,
NULL,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxLayerNormInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxLayerNormKernelInfo_u8 =
{
VX_KERNEL_ENUM_LAYERNORM,
VX_KERNEL_NAME_LAYERNORM_UINT8,
NULL,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxLayerNormInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxLayerNormKernelInfo_FP16toU8 =
{
VX_KERNEL_ENUM_LAYERNORM_FP16TOU8,
VX_KERNEL_NAME_LAYERNORM_FP16TOU8,
NULL,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxLayerNormInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxLayerNormKernelInfo_U8toFP16 =
{
VX_KERNEL_ENUM_LAYERNORM,
VX_KERNEL_NAME_LAYERNORM_U8TOFP16,
NULL,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxLayerNormInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxLayerNormKernelInfo_CPU =
{
VX_KERNEL_ENUM_LAYERNORM,
VX_KERNEL_NAME_LAYERNORM,
vxLayerNormKernel,
vxLayerNormKernelParam,
(sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_LAYERNORM_list[] =
{
&vxLayerNormKernelInfo_CPU,
&vxLayerNormKernelInfo,
&vxLayerNormKernelInfo_u8,
&vxLayerNormKernelInfo_FP16toU8,
&vxLayerNormKernelInfo_U8toFP16,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,190 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_REDUCE)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_REDUCE)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_reduce")
#define _VX_KERNEL_FUNC_KERNEL (vxReduceKernel)
static vx_status VX_CALLBACK vxReduceKernel
(
vx_node node,
const vx_reference* paramObj,
vx_uint32 paramNum
)
{
/* TODO: */
#define ARG_NUM (6)
#define TENSOR_NUM_INPUT (1)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vx_status status = VX_SUCCESS;
vx_context context = NULL;
vsi_nn_tensor_attr_t attr[TENSOR_NUM];
vx_uint32 stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM];
vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL};
vx_uint8 *buffer_ptr[TENSOR_NUM] = {NULL};
vx_tensor tensor[TENSOR_NUM];
vx_float32 factor0;
vx_int32 factor;
vx_uint32 batch, c, h, w;
vx_uint32 i, j, k, b;
//prepare data
context = vxGetContext((vx_reference)node);
for( i = 0; i < TENSOR_NUM_INPUT; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY);
}
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY);
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
//op calc
if (factor0 > 1)
{
factor = (vx_int32)(factor0 + 0.5);
w = attr[0].size[0];
h = attr[0].size[1];
c = attr[0].size[2];
batch = 1;
for(b = 0; b < batch; ++b){
for(k = 0; k < c; ++k){
for(j = 0; j < h*factor; ++j){
for(i = 0; i < w*factor; ++i){
vx_int32 in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor;
vx_int32 out_index = b*w*h*c*factor*factor + k*w*h*factor*factor +
j*w*factor + i;
vx_float32 fval;
//out[out_index] = in[in_index];
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index],
&fval, &attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
}
else
{
factor = (vx_int32)(1 / factor0 + 0.5);
w = attr[1].size[0];
h = attr[1].size[1];
c = attr[1].size[2];
batch = 1;
for(b = 0; b < batch; ++b){
for(k = 0; k < c; ++k){
for(j = 0; j < h; ++j){
for(i = 0; i < w; ++i){
vx_int32 in_index = b*w*h*c*factor*factor +
k*w*h*factor*factor + j*w*factor*factor + i*factor;
vx_int32 out_index = b*w*h*c + k*w*h + j * w + i;
vx_float32 fval;
//out[out_index] = in[in_index];
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval,
&attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
}
//save data
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY);
if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i]));
}
for( i = 0; i < TENSOR_NUM; i ++ )
{
if (buffer_ptr[i]) free(buffer_ptr[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL },
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_REDUCE_list[] =
{
&_VX_KERNEL_VAR,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,283 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_RESIZE)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_RESIZE)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_resize")
#define _VX_KERNEL_FUNC_KERNEL (vxResizeKernel)
static vsi_status VX_CALLBACK vxResizeKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
/* TODO: */
#define ARG_NUM (1)
#define TENSOR_NUM_INPUT (1)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vsi_status status = VX_SUCCESS;
vx_context context = NULL;
vsi_nn_tensor_attr_t attr[TENSOR_NUM];
uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM];
vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL};
uint8_t *buffer_ptr[TENSOR_NUM] = {NULL};
vx_tensor tensor[TENSOR_NUM];
float factor0;
int32_t factor;
uint32_t batch, c, h, w;
uint32_t i, j, k, b;
//prepare data
context = vxGetContext((vx_reference)node);
for( i = 0; i < TENSOR_NUM_INPUT; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY);
}
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY);
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
//op calc
if (factor0 > 1)
{
factor = (int32_t)(factor0 + 0.5);
w = attr[0].size[0];
h = attr[0].size[1];
c = attr[0].size[2];
batch = 1;
for(b = 0; b < batch; ++b){
for(k = 0; k < c; ++k){
for(j = 0; j < h*factor; ++j){
for(i = 0; i < w*factor; ++i){
int32_t in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor;
int32_t out_index = b*w*h*c*factor*factor + k*w*h*factor*factor +
j*w*factor + i;
float fval;
//out[out_index] = in[in_index];
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index],
&fval, &attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
}
else
{
factor = (int32_t)(1 / factor0 + 0.5);
w = attr[1].size[0];
h = attr[1].size[1];
c = attr[1].size[2];
batch = 1;
for(b = 0; b < batch; ++b){
for(k = 0; k < c; ++k){
for(j = 0; j < h; ++j){
for(i = 0; i < w; ++i){
int32_t in_index = b*w*h*c*factor*factor +
k*w*h*factor*factor + j*w*factor*factor + i*factor;
int32_t out_index = b*w*h*c + k*w*h + j * w + i;
float fval;
//out[out_index] = in[in_index];
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval,
&attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
}
//save data
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY);
if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i]));
}
for( i = 0; i < TENSOR_NUM; i ++ )
{
if (buffer_ptr[i]) free(buffer_ptr[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
};
vsi_status VX_CALLBACK vxTensorResizeInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
// Alignment with a power of two value.
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
vx_kernel_execution_parameters_t shaderParam = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
uint32_t uniPackEvenData_2x8[16] = {
0x33333333, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00003400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vsi_status status = VX_SUCCESS;
vx_tensor input = (vx_tensor)paramObj[0];
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1};
vsi_nn_tensor_attr_t attr;
uint32_t i, input_dim;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
input_dim = attr.dim_num;
for (i = 0; i < input_dim; i++)
{
input_size[i] = attr.size[i];
}
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkScale[0] = 16;
shaderParam.globalWorkScale[1] = 2;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
vxSetNodeUniform(nodObj, "uniPackEvenData_2x8", 1, uniPackEvenData_2x8);
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
return VX_SUCCESS;
}
static vx_param_description_t vxTensorResizeKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxTensorResize16BitsDownSampleQuarterKernelInfo =
{
VX_KERNEL_ENUM_RESIZE_16BITS_DOWNSAMPLE_QUARTER,
VX_KERNEL_NAME_RESIZE_16BITS_DOWNSAMPLE_QUARTER,
NULL,
vxTensorResizeKernelParam,
(sizeof(vxTensorResizeKernelParam) / sizeof(vxTensorResizeKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxTensorResizeInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxTensorResize8BitsDownSampleQuarterKernelInfo =
{
VX_KERNEL_ENUM_RESIZE_8BITS_DOWNSAMPLE_QUARTER,
VX_KERNEL_NAME_RESIZE_8BITS_DOWNSAMPLE_QUARTER,
NULL,
vxTensorResizeKernelParam,
(sizeof(vxTensorResizeKernelParam) / sizeof(vxTensorResizeKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxTensorResizeInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_RESIZE_list[] =
{
&_VX_KERNEL_VAR,
&vxTensorResize16BitsDownSampleQuarterKernelInfo,
&vxTensorResize8BitsDownSampleQuarterKernelInfo,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -1,317 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_ROI_ALIGN)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_ROI_ALIGN)
#define _VX_KERNEL_NAME (VX_KERNEL_NAME_ROI_ALIGN)
#define _VX_KERNEL_FUNC_KERNEL (vxRoi_alignKernel)
static vsi_status VX_CALLBACK vxRoi_alignKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
#define ARG_NUM (6)
#define TENSOR_NUM_INPUT (3)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vsi_status status = VSI_FAILURE;
vx_context context = NULL;
vx_tensor input[TENSOR_NUM_INPUT] = {0};
vx_tensor output[TENSOR_NUM_OUTPUT] = {0};
float *f32_in_buffer[TENSOR_NUM_INPUT] = {0};
int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0};
float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0};
vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT];
vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT];
uint32_t in_elements[TENSOR_NUM_INPUT] = {0};
uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0};
int32_t output_height;
int32_t output_width;
float height_ratio;
float width_ratio;
int32_t height_sample_num;
int32_t width_sample_num;
uint32_t i = 0;
for(i = 0; i < TENSOR_NUM_INPUT; i++)
{
memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
}
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
{
memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
}
/* prepare data */
context = vxGetContext((vx_reference)node);
for(i = 0; i < TENSOR_NUM_INPUT; i ++)
{
input[i] = (vx_tensor)paramObj[i];
status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]);
TEST_CHECK_STATUS(status, final);
in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]);
if (i == 2)
{
int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context,
input[i], &in_attr[i]);
}
else
{
f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float));
status = vsi_nn_vxConvertTensorToFloat32Data(
context, input[i], &in_attr[i], f32_in_buffer[i],
in_elements[i] * sizeof(float));
TEST_CHECK_STATUS(status, final);
}
}
for(i = 0; i < TENSOR_NUM_OUTPUT; i ++)
{
output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT];
status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]);
TEST_CHECK_STATUS(status, final);
out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]);
f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float));
memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float));
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(output_height),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(output_width),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(height_ratio),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(width_ratio),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(height_sample_num),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(width_sample_num),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
/* CPU reference implementation */
{
uint32_t n, j, k;
uint32_t kRoiDim = 4;
float heightScale = 1.0f / height_ratio;
float widthScale = 1.0f / width_ratio;
uint32_t inHeight = in_attr[0].size[2];
uint32_t inWidth = in_attr[0].size[1];
uint32_t inDepth = in_attr[0].size[0];
uint32_t numRois = in_attr[1].size[1];
uint32_t outHeight = out_attr[0].size[2];
uint32_t outWidth = out_attr[0].size[1];
uint32_t out_index = 0;
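/* ROI Align reference: each ROI is divided into an outHeight x outWidth grid of bins;
 * every bin is sampled on an hSamplingRatio x wSamplingRatio grid of points, each
 * sample is bilinearly interpolated from the four surrounding input pixels, and the
 * samples are averaged per channel to produce the output value. */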
for(n = 0; n < numRois; n++)
{
uint32_t batchId = int32_in_buffer[2][n];
float scale = (in_attr[1].dtype.vx_type == VSI_NN_TYPE_UINT16) ? 0.125f : 1.0f;
float wRoiStart = f32_in_buffer[1][n * kRoiDim] * widthScale * scale;
float hRoiStart = f32_in_buffer[1][n * kRoiDim + 1] * heightScale * scale;
float wRoiEnd = f32_in_buffer[1][n * kRoiDim + 2] * widthScale * scale;
float hRoiEnd = f32_in_buffer[1][n * kRoiDim + 3] * heightScale * scale;
float roiWidth = vsi_nn_max((wRoiEnd - wRoiStart), 1.0f);
float roiHeight = vsi_nn_max((hRoiEnd - hRoiStart), 1.0f);
float wStepSize = roiWidth / outWidth;
float hStepSize = roiHeight / outHeight;
uint32_t wSamplingRatio = width_sample_num > 0
? width_sample_num : (uint32_t)ceil(wStepSize);
uint32_t hSamplingRatio = height_sample_num > 0
? height_sample_num : (uint32_t)ceil(hStepSize);
int32_t numSamplingPoints = wSamplingRatio * hSamplingRatio;
float wBinSize = wStepSize / (float)(wSamplingRatio);
float hBinSize = hStepSize / (float)(hSamplingRatio);
int32_t batch_base_index = batchId * inHeight * inWidth * inDepth;
for (i = 0; i < outHeight; i++)
{
for (j = 0; j < outWidth; j++)
{
float wStart = wStepSize * j + wRoiStart;
float wEnd = wStepSize * (j + 1) + wRoiStart;
float hStart = hStepSize * i + hRoiStart;
float hEnd = hStepSize * (i + 1) + hRoiStart;
float x,y;
for (y = hStart + hBinSize / 2; y < hEnd; y += hBinSize)
{
for (x = wStart + wBinSize / 2; x < wEnd; x += wBinSize)
{
uint32_t x1 = (uint32_t)floor(x);
uint32_t y1 = (uint32_t)floor(y);
uint32_t x2 = x1 + 1, y2 = y1 + 1;
float dx1 = x - (float)(x1);
float dy1 = y - (float)(y1);
if (x1 >= inWidth - 1) {
x1 = x2 = inWidth - 1;
dx1 = 0;
}
if (y1 >= inHeight - 1) {
y1 = y2 = inHeight - 1;
dy1 = 0;
}
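/* Bilinear interpolation: the four weights below are the areas of the sub-rectangles
 * opposite each corner pixel of the unit cell containing the sample point (x, y). */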
{
float dx2 = 1.0f - dx1, dy2 = 1.0f - dy1;
float ws[] = {dx2 * dy2, dx1 * dy2,
dx2 * dy1, dx1 * dy1};
uint32_t offsets[] = {y1 * inWidth * inDepth + x1 * inDepth,
y1 * inWidth * inDepth + x2 * inDepth,
y2 * inWidth * inDepth + x1 * inDepth,
y2 * inWidth * inDepth + x2 * inDepth};
for (k = 0; k < inDepth; k++) {
float interpolation = 0;
uint32_t c;
for (c = 0; c < 4; c++)
{
interpolation += ws[c]
* f32_in_buffer[0][batch_base_index + offsets[c] + k];
}
f32_out_buffer[0][out_index + k] += interpolation;
}
}
}
}
for (k = 0; k < inDepth; k++)
{
f32_out_buffer[0][out_index + k] /= (float)(numSamplingPoints);
}
out_index += inDepth;
}
}
}
}
/* save data */
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
{
status = vsi_nn_vxConvertFloat32DataToTensor(
context, output[i], &out_attr[i], f32_out_buffer[i],
out_elements[i] * sizeof(float));
TEST_CHECK_STATUS(status, final);
}
final:
for (i = 0; i < TENSOR_NUM_INPUT; i++)
{
if (f32_in_buffer[i]) free(f32_in_buffer[i]);
if (int32_in_buffer[i]) free(int32_in_buffer[i]);
}
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
{
if (f32_out_buffer[i]) free(f32_out_buffer[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t vxRoi_alignKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
vx_status VX_CALLBACK vxRoi_alignInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
vx_uint32 paraNum
)
{
vx_status status = VX_SUCCESS;
/*TODO: Add initial code for VX program*/
return status;
}
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t vxRoi_align_CPU =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
vxRoi_alignKernelParam,
_cnt_of_array( vxRoi_alignKernelParam ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxRoi_align_VX =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
NULL,
vxRoi_alignKernelParam,
_cnt_of_array( vxRoi_alignKernelParam ),
vsi_nn_KernelValidator,
NULL,
NULL,
vxRoi_alignInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_ROI_ALIGN_list[] =
{
&vxRoi_align_CPU,
&vxRoi_align_VX,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@@ -1,410 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_SCALE)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SCALE)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_scale")
#define _VX_KERNEL_FUNC_KERNEL (vxScaleKernel)
static vsi_status VX_CALLBACK vxScaleKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if( 6 == paramNum )
{
vx_context context = NULL;
vx_tensor input_tensor = NULL;
vx_tensor scale_tensor = NULL;
vx_tensor bias_tensor = NULL;
vx_tensor output_tensor = NULL;
uint8_t * input_buffer = NULL;
uint8_t * scale_buffer = NULL;
uint8_t * bias_buffer = NULL;
uint8_t * output_buffer = NULL;
vx_scalar axis_scalar = NULL;
vx_scalar has_bias_scalar = NULL;
int axis = 1;
float has_bias = 0;
uint32_t input_dims = 0;
uint32_t scale_dims = 0;
uint32_t bias_dims = 0;
uint32_t output_dims = 0;
vsi_enum inputFormat = VSI_NN_TYPE_FLOAT16;
vsi_enum scaleFormat = VSI_NN_TYPE_FLOAT16;
vsi_enum biasFormat = VSI_NN_TYPE_FLOAT32;
vsi_enum outputFormat = VSI_NN_TYPE_FLOAT16;
uint32_t input_size[4] = {1, 1, 1, 1};
uint32_t scale_size[4] = {1, 1, 1, 1};
uint32_t bias_size[4] = {1, 1, 1, 1};
uint32_t output_size[4] = {1, 1, 1, 1};
uint32_t input_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 };
uint32_t output_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 };
vx_tensor_addressing input_user_addr = NULL;
vx_tensor_addressing scale_user_addr = NULL;
vx_tensor_addressing bias_user_addr = NULL;
vx_tensor_addressing output_user_addr = NULL;
vsi_nn_tensor_attr_t out_attr;
status = VX_SUCCESS;
memset(&out_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));
input_tensor = (vx_tensor)paramObj[0];
scale_tensor = (vx_tensor)paramObj[1];
bias_tensor = (vx_tensor)paramObj[2];
output_tensor = (vx_tensor)paramObj[3];
axis_scalar = (vx_scalar)paramObj[4];
has_bias_scalar = (vx_scalar)paramObj[5];
context = vxGetContext((vx_reference)node);
if( NULL == context)
{
VSILOGE("vxGetContext failure!\n");
status = VX_FAILURE;
goto OnError;
}
input_buffer = vsi_nn_ConvertRawTensorToData(context, input_tensor,
&input_dims, &inputFormat, input_size, input_stride_size,
&input_user_addr, VX_READ_ONLY);
if( NULL == input_buffer )
{
VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n");
status = VX_ERROR_NO_MEMORY;
goto OnError;
}
scale_buffer = vsi_nn_ConvertRawTensorToData(context, scale_tensor,
&scale_dims, &scaleFormat, scale_size, input_stride_size,
&scale_user_addr, VX_READ_ONLY);
if( NULL == scale_buffer )
{
VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n");
status = VX_ERROR_NO_MEMORY;
goto OnError;
}
bias_buffer = vsi_nn_ConvertRawTensorToData(context, bias_tensor,
&bias_dims, &biasFormat, bias_size, input_stride_size,
&bias_user_addr, VX_READ_ONLY);
if( NULL == bias_buffer )
{
VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n");
status = VX_ERROR_NO_MEMORY;
goto OnError;
}
output_buffer = vsi_nn_ConvertRawTensorToData(context, output_tensor,
&output_dims, &outputFormat, output_size, output_stride_size,
&output_user_addr, VX_WRITE_ONLY);
if( NULL == output_buffer )
{
VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n");
status = VX_ERROR_NO_MEMORY;
goto OnError;
}
status = vsi_nn_vxGetTensorAttr(output_tensor, &out_attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
goto OnError;
}
status = vxCopyScalar(axis_scalar, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
if( VX_SUCCESS != status)
{
VSILOGE("vxCopyScalar axis failure! status:%d\n", status);
goto OnError;
}
status = vxCopyScalar(has_bias_scalar, &has_bias, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
if( VX_SUCCESS != status )
{
VSILOGE("vxCopyScalar axis failure! has_bias:%f\n", has_bias);
goto OnError;
}
if( input_dims != output_dims )
{
VSILOGE("Invalid parameters, input_dims output_dims mismatch %d:%d\n",
input_dims, output_dims);
status = VX_ERROR_INVALID_PARAMETERS;
goto OnError;
}
if( input_size[0] != scale_size[0] || input_size[0] != bias_size[0] )
{
VSILOGE("Invalid parameters, input size mismatch %d:%d:%d\n",
input_size[0], scale_size[0], bias_size[0]);
status = VX_ERROR_INVALID_PARAMETERS;
goto OnError;
}
{
uint32_t i = 0;
uint32_t j = 0;
uint32_t fixed_num = 1;
uint32_t changed_num = 1;
fixed_num = input_size[1] * input_size[2] * input_size[3];
changed_num = input_size[0];
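/* Per-element compute: out[i][j] = in[i][j] * scale[j] + bias[j], with j running along
 * axis 0 (length input_size[0]) and i over all remaining dimensions; input, scale and
 * output are fp16, bias is fp32. */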
for( i = 0; i < fixed_num; i++ )
{
int16_t* cur_input_row_ofst = ((int16_t *)input_buffer) + i * changed_num;
int16_t* cur_scale_row_ofst = ((int16_t *)scale_buffer);
float* cur_bias_row_ofst = ((float *)bias_buffer);
int16_t* cur_output_row_ofst = ((int16_t *)output_buffer) + i * changed_num;
for( j = 0; j < changed_num; j++ )
{
float cur_input_v = vsi_nn_Fp16ToFp32(*(cur_input_row_ofst + j));
float cur_scale_v = vsi_nn_Fp16ToFp32(*(cur_scale_row_ofst + j));
float cur_bias_v = *(cur_bias_row_ofst + j);
float cur_result = cur_input_v * cur_scale_v + cur_bias_v;
*(cur_output_row_ofst + j) = vsi_nn_Fp32ToFp16(cur_result);
}
}
#if defined(_SAVE_TENSOR)
{
static int count = 0;
char fname[256] = { 0 };
sprintf(fname, "scale_output_tensor.%d.axis.%d.txt", count, axis);
vsi_nn_SaveDataToText(fname, output_buffer,
vsi_nn_ShapeProduct(output_size, output_dims), VSI_NN_TYPE_FLOAT16, NULL);
count++;
}
#endif
}
status = vsi_nn_vxCopyDataToTensor(context, output_tensor, &out_attr, output_buffer);
TEST_CHECK_STATUS(status, OnError);
OnError:
if( NULL != input_buffer )
{
free( input_buffer );
input_buffer = NULL;
}
if( NULL != scale_buffer )
{
free( scale_buffer );
scale_buffer = NULL;
}
if( NULL != bias_buffer )
{
free( bias_buffer );
bias_buffer = NULL;
}
if( NULL != output_buffer )
{
free( output_buffer );
output_buffer = NULL;
}
if (input_user_addr)
{
vxReleaseTensorAddressing(&input_user_addr);
}
if (scale_user_addr)
{
vxReleaseTensorAddressing(&scale_user_addr);
}
if (bias_user_addr)
{
vxReleaseTensorAddressing(&bias_user_addr);
}
if (output_user_addr)
{
vxReleaseTensorAddressing(&output_user_addr);
}
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
};
vsi_status VX_CALLBACK vxScaleInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
// Alignment with a power of two value.
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))
vx_kernel_execution_parameters_t shaderParam = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
uint32_t uniExtractHalf8_2x8[16] = {
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
};
uint32_t uniFp16MulFp16ToFp32_Lo_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
uint32_t uniFp16MulFp16ToFp32_Hi_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
};
vsi_status status = VX_SUCCESS;
vx_tensor input = (vx_tensor)paramObj[0];
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1};
vx_uint32 i = 0;
vsi_nn_tensor_attr_t attr;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
for (i = 0; i < attr.dim_num; i++)
{
input_size[i] = attr.size[i];
}
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkScale[0] = 8;
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
vxSetNodeUniform(nodObj, "uniExtractHalf8_2x8", 1, uniExtractHalf8_2x8);
vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Lo_4x4", 1, uniFp16MulFp16ToFp32_Lo_4x4);
vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Hi_4x4", 1, uniFp16MulFp16ToFp32_Hi_4x4);
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
return status;
}
static vx_param_description_t vxScaleKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxScaleKernelInfo =
{
VX_KERNEL_ENUM_SCALE,
VX_KERNEL_NAME_SCALE_FP16,
NULL,
vxScaleKernelParam,
(sizeof(vxScaleKernelParam) / sizeof(vxScaleKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxScaleInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_SCALE_list[] =
{
&_VX_KERNEL_VAR,
&vxScaleKernelInfo,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@@ -1,345 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
vsi_status vxShuffleChannelFunc
(
vx_context context,
vx_tensor input,
vx_tensor output,
int32_t group_number,
int32_t axis
)
{
vsi_status status = VX_SUCCESS;
vsi_nn_tensor_attr_t input_attr;
vsi_nn_tensor_attr_t output_attr;
uint8_t *in_data = NULL;
uint8_t *out_data = NULL;
uint32_t stride_size[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t buf_sz = 0;
uint32_t group_row = group_number;
uint32_t chs = 0, group_col = 0;
uint32_t len = 1, num = 1, feature_map_size = 1;
uint32_t n = 0, i = 0, j = 0;
uint32_t type_bytes = 0, len_bytes = 0, fms_bytes = 0;
status = vsi_nn_vxGetTensorAttr(input, &input_attr);
status |= vsi_nn_vxGetTensorAttr(output, &output_attr);
TEST_CHECK_STATUS(status, final);
in_data = vsi_nn_vxCopyTensorToData(context, input, &input_attr);
TEST_CHECK_PTR(in_data, final);
buf_sz = vsi_nn_GetStrideSize(&output_attr, stride_size);
out_data = (uint8_t *)malloc( buf_sz );
TEST_CHECK_PTR(out_data, final);
chs = input_attr.size[axis];
group_col = chs / group_row;
type_bytes = vsi_nn_TypeGetBytes( input_attr.dtype.vx_type );
for ( i = 0; i < (uint32_t)axis; i++)
{
len *= input_attr.size[i];
}
for ( i = axis + 1; i < input_attr.dim_num; i++)
{
num *= input_attr.size[i];
}
for ( i = 0; i <= (uint32_t)axis; i++)
{
feature_map_size *= input_attr.size[i];
}
/* Shuffle Channel CPU implementation; the shape and dtype of the output must be the same as the input */
len_bytes = len * type_bytes;
fms_bytes = feature_map_size * type_bytes;
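/* Treat the axis dimension as a group_row x group_col matrix of channel groups and
 * transpose it: group (i, j) of the input is copied to position (j, i) of the output,
 * len elements (the product of the dimensions below axis) at a time. */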
for ( n = 0; n < num; n++)
{
for ( i = 0; i < group_row; i++)
{
for ( j = 0; j < group_col; j++)
{
uint8_t *in_ptr = in_data + n * fms_bytes + (i * group_col + j) * len_bytes;
uint8_t *out_ptr = out_data + n * fms_bytes + (j * group_row + i) * len_bytes;
memcpy(out_ptr, in_ptr, len_bytes);
}
}
}
/* Copy data to output tensor */
status = vsi_nn_vxCopyDataToTensor(context, output, &output_attr, out_data);
TEST_CHECK_STATUS(status, final);
final:
if (in_data) free(in_data);
if (out_data) free(out_data);
return status;
}
vsi_status VX_CALLBACK vxShuffleChannelKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
if(paramNum == 4)
{
vx_context context = NULL;
// tensor
vx_tensor imgObj[2] = { NULL };
// scalar
vx_scalar scalar[2] = { NULL };
int32_t group_number = 0;
int32_t axis = 0;
imgObj[0] = (vx_tensor)paramObj[0];
imgObj[1] = (vx_tensor)paramObj[1];
scalar[0] = (vx_scalar)paramObj[2];
scalar[1] = (vx_scalar)paramObj[3];
context = vxGetContext((vx_reference)node);
TEST_CHECK_PTR(context,final);
// scalar
status = vxCopyScalar(scalar[0], &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
TEST_CHECK_STATUS(status, final);
status = vxCopyScalar(scalar[1], &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
TEST_CHECK_STATUS(status, final);
// Call C Prototype
status = vxShuffleChannelFunc(context, imgObj[0], imgObj[1], group_number, axis);
TEST_CHECK_STATUS(status, final);
}
final:
return status;
}
vsi_status VX_CALLBACK vxShuffleChannelInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
vsi_status status = VX_SUCCESS;
// Alignment with a power of two value.
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))
vx_kernel_execution_parameters_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
vx_tensor input = (vx_tensor)paramObj[0];
vx_scalar group_numbers = (vx_scalar)paramObj[2];
vx_scalar axis_s = (vx_scalar)paramObj[3];
uint32_t input_size[4] = {1, 1, 1, 1};
vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16;
int32_t group_number = 0;
int32_t axis = 0;
int32_t group_column = 0;
float rgroup_column = 0.0f;
uint32_t chs = 0;
vx_uint32 i = 0;
vsi_nn_tensor_attr_t attr;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
for (i = 0; i < attr.dim_num; i++)
{
input_size[i] = attr.size[i];
}
inputDataFormat = attr.dtype.vx_type;
status |= vxCopyScalar(group_numbers, &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
status |= vxCopyScalar(axis_s, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
if(VX_SUCCESS != status)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
return status;
}
chs = input_size[axis];
if (chs % group_number)
{
VSILOGE("input channel can't be exact divided by group number! at line %d\n", __LINE__);
return VX_FAILURE;
}
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkOffset[2] = 0;
if (axis == 2)
{
if (inputDataFormat == VSI_NN_TYPE_FLOAT16 || inputDataFormat == VSI_NN_TYPE_INT16)
shaderParam.globalWorkScale[0] = 8;
else
shaderParam.globalWorkScale[0] = 16;
shaderParam.globalWorkScale[1] = 4;
shaderParam.globalWorkScale[2] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1];
shaderParam.globalWorkSize[2] = input_size[2];
}
else if (axis == 1)
{
shaderParam.globalWorkScale[0] = 32;
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkScale[2] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], 4);
shaderParam.globalWorkSize[1] = input_size[1];
shaderParam.globalWorkSize[2] = input_size[2];
}
else
{
VSILOGE("[%s : %d]Initializer failure, not support axis: %d! \n",__FILE__, __LINE__, axis);
return VX_FAILURE;
}
group_column = chs / group_number;
rgroup_column = 1.0f / group_column;
status |= vxSetNodeUniform(nodObj, "group_column", 1, &group_column);
status |= vxSetNodeUniform(nodObj, "rgroup_column", 1, &rgroup_column);
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
if(status < 0)
{
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
}
return status;
}
static vx_param_description_t vxShuffleChannelKernelParam[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t vxShuffleChannelKernelInfo =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL,
NULL,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxShuffleChannelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxShuffleChannelKernelInfo8Bits =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL8BITS,
NULL,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxShuffleChannelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxShuffleChannelKernelInfo_CPU =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL,
vxShuffleChannelKernel,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxShuffleChannelKernelInfo_16BitsAxis1 =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL16BITS_AXIS1,
NULL,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxShuffleChannelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxShuffleChannelKernelInfo_8BitsAxis1 =
{
VX_KERNEL_ENUM_SHUFFLECHANNEL,
VX_KERNEL_NAME_SHUFFLECHANNEL8BITS_AXIS1,
NULL,
vxShuffleChannelKernelParam,
(sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])),
vsi_nn_KernelValidator,
NULL,
NULL,
vxShuffleChannelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_SHUFFLECHANNEL_list[] =
{
&vxShuffleChannelKernelInfo_CPU,
&vxShuffleChannelKernelInfo,
&vxShuffleChannelKernelInfo8Bits,
&vxShuffleChannelKernelInfo_16BitsAxis1,
&vxShuffleChannelKernelInfo_8BitsAxis1,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@@ -1,293 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_VAR (vx_kernel_SPACE2DEPTH)
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SPACE2DEPTH)
#define _VX_KERNEL_NAME ("vsi_nn_kernel_space2depth")
#define _VX_KERNEL_FUNC_KERNEL (vxSpace2DepthKernel)
static vsi_status VX_CALLBACK vxSpace2DepthKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
/* CPU reference implementation of Space2Depth */
#define ARG_NUM (2)
#define TENSOR_NUM_INPUT (1)
#define TENSOR_NUM_OUTPUT (1)
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
vsi_status status = VX_SUCCESS;
uint32_t i = 0;
vx_context context = NULL;
vsi_nn_tensor_attr_t attr[TENSOR_NUM];
uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM];
vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL};
uint8_t *buffer_ptr[TENSOR_NUM] = {NULL};
vx_tensor tensor[TENSOR_NUM] = {NULL};
int32_t block_size_x = 0, block_size_y = 0;
int32_t output_depth = 0, output_height = 0, output_width = 0;
int32_t input_batch = 0, input_depth = 0, input_height = 0, input_width = 0;
int32_t batch = 0, dim = 0;
for(i = 0; i < TENSOR_NUM; i++)
{
memset(&attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
}
//prepare data
context = vxGetContext((vx_reference)node);
for( i = 0; i < TENSOR_NUM_INPUT; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY);
}
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
tensor[i] = (vx_tensor)paramObj[i];
buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i],
&(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY);
}
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(block_size_x),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(block_size_y),
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
dim = attr[0].dim_num;
if(dim < 4)
attr[0].size[3] = 1;
//op calc
//output_batch = attr[1].size[3];
output_depth = attr[1].size[2];
output_height = attr[1].size[1];
output_width = attr[1].size[0];
input_batch = attr[0].size[3];
input_depth = attr[0].size[2];
input_height = attr[0].size[1];
input_width = attr[0].size[0];
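/* Space-to-depth mapping: each block_size_x x block_size_y spatial block folds into
 * the depth dimension:
 *   out_w = in_w / block_size_x, out_h = in_h / block_size_y,
 *   out_d = (in_w % block_size_x) + (in_h % block_size_y) * block_size_x
 *           + in_d * block_size_x * block_size_y */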
for (batch = 0; batch < input_batch; ++batch)
{
vx_uint32 output_batch_index = batch * output_height * output_width * output_depth;
vx_uint32 input_batch_index = batch * input_height * input_width * input_depth;
vx_uint32 in_d;
for (in_d = 0; in_d < (vx_uint32)input_depth; in_d ++)
{
vx_uint32 in_h;
for (in_h = 0; in_h < (vx_uint32)input_height; ++ in_h)
{
vx_uint32 in_w;
for (in_w = 0; in_w < (vx_uint32)input_width; in_w ++)
{
vx_int32 out_w = in_w / block_size_x;
vx_int32 out_h = in_h / block_size_y;
//vx_int32 out_d = (in_w % block_size_x) * input_depth + (in_h % block_size_y) * block_size_x * input_depth + in_d;
vx_int32 out_d = (in_w % block_size_x) + (in_h % block_size_y) * block_size_x + in_d * block_size_x * block_size_y;
vx_int32 in_index = in_w + in_h * input_width +in_d * input_height * input_width + input_batch_index;
vx_int32 out_index = out_w + out_h * output_width + out_d * output_width * output_height + output_batch_index;
//outputBase[out_index] = inputBase[in_index];
float fval;
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index],
&fval, &attr[0].dtype);
vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index],
&attr[1].dtype);
}
}
}
}
//save data
for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ )
{
vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY);
}
for( i = 0; i < TENSOR_NUM; i ++ )
{
if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i]));
if (buffer_ptr[i]) free(buffer_ptr[i]);
}
return status;
} /* _VX_KERNEL_FUNC_KERNEL() */
vsi_status VX_CALLBACK vxSpace2DepthInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
uint32_t paraNum
)
{
// Alignment with a power of two value.
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))
vx_kernel_execution_parameters_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
vsi_status status = VX_SUCCESS;
vx_tensor input = (vx_tensor)paramObj[0];
uint32_t input_size[4] = {1, 1, 1, 1};
vx_uint32 input_dimz = 0;
vx_uint32 input_depth = 0;
vx_uint32 i = 0;
vsi_nn_tensor_attr_t attr;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(input, &attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
for (i = 0; i < attr.dim_num; i++)
{
input_size[i] = attr.size[i];
}
input_depth = input_size[2];
if(input_size[3] > 0)
input_dimz = input_depth * input_size[3];
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkOffset[2] = 0;
shaderParam.globalWorkScale[0] = 8;
shaderParam.globalWorkScale[1] = 1;
shaderParam.globalWorkScale[2] = 1;
shaderParam.localWorkSize[0] = 8;
shaderParam.localWorkSize[1] = 1;
shaderParam.localWorkSize[2] = 1;
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1)
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
shaderParam.globalWorkSize[2] = input_dimz;
{
vx_uint32 uniExtractEvenFp16Stride2_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
};
vx_uint32 uniExtractOddFp16Stride2_4x4[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
};
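/* The two DP tables above select the even / odd fp16 lanes of an 8-wide read
 * (a stride-2 deinterleave); per their names they serve the block_size = 2 path
 * of the shader. */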
status |= vxSetNodeUniform(nodObj, "uniExtractEvenFp16Stride2_4x4", 1, uniExtractEvenFp16Stride2_4x4);
status |= vxSetNodeUniform(nodObj, "uniExtractOddFp16Stride2_4x4", 1, uniExtractOddFp16Stride2_4x4);
//status |= vxSetNodeUniform(nodObj, "input_depth", 1, &input_depth);
}
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
return status;
}
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t vxSpace2DepthKernelInfo_int16_int16 =
{
_VX_KERNEL_ID,
VX_KERNEL_NAME_SPACE2DEPTH_INT16_INT16,
NULL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vxSpace2DepthInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_SPACE2DEPTH_list[] =
{
NULL,
&_VX_KERNEL_VAR,
&vxSpace2DepthKernelInfo_int16_int16,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@@ -54,3 +54,81 @@ __kernel void a_times_b_plus_c_F16_F16_F16toF16_2D
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniExtractHalf8_2x8;
_viv_uniform VXC_512Bits uniA_Times_B_lo_4x4;
_viv_uniform VXC_512Bits uniA_Times_B_hi_4x4;
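/* a * b + c with an fp32 addend: a and b are read as packed fp16 (8 lanes), multiplied
 * in two fp32 halves of four via the DP4x4 uniforms, c is read as fp32 with read_imagef,
 * and the fp32 result is converted back to fp16 for the output. */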
__kernel void a_times_b_plus_c_F16_F16_F32toF16
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__read_only image2d_array_t input2,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_half8 src0, src1, dst;
vxc_ushort8 vec0, vec1, result;
float4 b0, b1;
float4 dst0, dst1;
VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src0, vec0, 16);
VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src1, vec1, 16);
b0 = read_imagef(input2, coord);
coord.x += 4;
b1 = read_imagef(input2, coord);
coord.x -= 4;
VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4);
VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4);
dst0 += b0;
dst1 += b1;
half4 t0, t1;
_viv_asm(CONV, t0, dst0);
_viv_asm(CONV, t1, dst1);
VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);
_viv_asm(COPY, result, dst, 16);
VXC_WriteImage2DArray(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void a_times_b_plus_c_F16_F16_F32toF16_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input2,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
vxc_half8 src0, src1, dst;
vxc_ushort8 vec0, vec1, result;
float4 b0, b1;
float4 dst0, dst1;
VXC_ReadImage(vec0, input0, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src0, vec0, 16);
VXC_ReadImage(vec1, input1, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src1, vec1, 16);
b0 = read_imagef(input2, coord.xy);
coord.z = coord.x + 4;
b1 = read_imagef(input2, coord.zy);
VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4);
VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4);
dst0 += b0;
dst1 += b1;
half4 t0, t1;
_viv_asm(CONV, t0, dst0);
_viv_asm(CONV, t1, dst1);
VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);
_viv_asm(COPY, result, dst, 16);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,10 +1,11 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int indices_num;
_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;
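/* Gather along the chosen axis: each work-item reads one index from input1 at (gidy, 0)
 * and maps it to source line gidz * axis_num + index of input0; the gathered elements
 * land in line gidz * indices_num + gidy of the output. */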
__kernel void gather_I8toI8(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -16,7 +17,7 @@ __kernel void gather_I8toI8(
int gidz = get_global_id(2); // block_num
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_char16 src;
@@ -28,7 +29,7 @@ __kernel void gather_I8toI8(
__kernel void gather_U8toU8(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -40,7 +41,7 @@ __kernel void gather_U8toU8(
int gidz = get_global_id(2); // block_num
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_uchar16 src;
@@ -52,7 +53,7 @@ __kernel void gather_U8toU8(
__kernel void gather_I16toI16(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -66,7 +67,7 @@ __kernel void gather_I16toI16(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -78,7 +79,7 @@ __kernel void gather_I16toI16(
__kernel void gather_F16toF16(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -92,7 +93,7 @@ __kernel void gather_F16toF16(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -101,3 +102,107 @@ __kernel void gather_F16toF16(
int2 coord = (int2)(gidx, gidz * indices_num + gidy);
VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
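/* axis == 0 variants: each work-item loads an int4 of indices from input1 at
 * x = get_global_id(0), gathers one element per index from row get_global_id(1) of
 * input0, and writes the four results as consecutive outputs. */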
__kernel void gather_I8toI8_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_char16 src, dst;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_U8toU8_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_uchar16 src, dst;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_I16toI16_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_short8 src, dst;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_F16toF16_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_short8 src, dst;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

View File

@@ -11,7 +11,7 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
#define GATHER_8BITS_TO_F16(src0_type_name, read_type) \
__kernel void gather_##src0_type_name##toF16( \
__read_only image2d_t input0, \
__read_only image2d_array_t input1, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
@@ -23,7 +23,7 @@ __kernel void gather_##src0_type_name##toF16( \
int gidz = get_global_id(2); \
\
int4 coord_in = (int4)(gidy, 0, gidx, 0); \
int4 indice = read_imagei(input1, coord_in.xyyy); \
int4 indice = read_imagei(input1, coord_in.xy); \
coord_in.w = gidz * axis_num + indice.x; \
\
read_type src; \
@@ -47,7 +47,7 @@ GATHER_8BITS_TO_F16(I8, vxc_char16)
#define GATHER_F16_TO_QINT(src1_type_name, write_type) \
__kernel void gather_F16to##src1_type_name( \
__read_only image2d_t input0, \
__read_only image2d_array_t input1, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
@@ -59,7 +59,7 @@ __kernel void gather_F16to##src1_type_name( \
int gidz = get_global_id(2); \
int4 coord_in = (int4)(gidy, 0, gidx, 0); \
\
int4 indice = read_imagei(input1, coord_in.xyyy); \
int4 indice = read_imagei(input1, coord_in.xy); \
coord_in.w = gidz * axis_num + indice.x; \
\
vxc_short8 src; \
@@ -79,7 +79,7 @@ GATHER_F16_TO_QINT(I16, vxc_short8)
__kernel void gather_I16toF16(
__read_only image2d_t input0,
__read_only image2d_array_t input1,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
@@ -91,7 +91,7 @@ __kernel void gather_I16toF16(
int gidz = get_global_id(2);
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xyyy);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -109,3 +109,97 @@ __kernel void gather_I16toF16(
VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
#define GATHER_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \
__kernel void gather_##src0_type_name##toF16_axis0( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
int4 indices = read_imagei(input1, coord.xx); \
int2 coord_in = (int2)(indices.x, get_global_id(1)); \
\
read_type src; \
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
indices.x = get_global_id(1); \
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GATHER_8BITS_TO_F16_AXIS0(U8, vxc_uchar16)
GATHER_8BITS_TO_F16_AXIS0(I8, vxc_char16)
#define GATHER_F16_TO_QINT_AXIS0(src1_type_name, write_type) \
__kernel void gather_F16to##src1_type_name##_axis0( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
int4 indices = read_imagei(input1, coord.xx); \
int2 coord_in = (int2)(indices.x, get_global_id(1)); \
\
vxc_short8 src; \
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
indices.x = get_global_id(1); \
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GATHER_F16_TO_QINT_AXIS0(U8, vxc_uchar16)
GATHER_F16_TO_QINT_AXIS0(I8, vxc_char16)
GATHER_F16_TO_QINT_AXIS0(I16, vxc_short8)
__kernel void gather_I16toF16_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_short8 src;
VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
indices.x = get_global_id(1);
VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
vxc_half8 src0;
vxc_short8 dst0;
vxc_ushort8 ms0;
_viv_asm(COPY, ms0, multAndoutZP0, 16);
VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniU8MulAndPostShift_0_Lo_2x8);
_viv_asm(COPY, dst0, src0, 16);
VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

View File

@@ -0,0 +1,279 @@
#include "cl_viv_vx_ext.h"
/**************************layernorm float16***********************************/
_viv_uniform int width;
_viv_uniform float dimRatio;
_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;
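/* Layer norm over the innermost dimension (length `width`), fp16 in / fp16 out.
 * Pass 1 accumulates sum and sum-of-squares eight fp16 values at a time
 * (uniFp16SumSqr_dp8x2) to form mean = sum * dimRatio and
 * vari = sqr * dimRatio - mean * mean (dimRatio is expected to be 1 / width).
 * Pass 2 recomputes out = (in - mean) * rsqrt(vari + eps) * scale + bias in fp32,
 * four values per iteration, then packs the result back to fp16. */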
__kernel void layer_norm_F16toF16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
vxc_short8 src0, src1;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniFp16SumSqr_dp8x2);
sum += sumsqr.x;
sqr += sumsqr.y;
}
vxc_float mean;
mean = sum * dimRatio;
vxc_float vari;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 bias_f;
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = read_imagef(bias, coord.xw);
vxc_half8 in_h, scale_h;
_viv_asm(COPY, in_h, src0, 16);
_viv_asm(COPY, scale_h, src1, 16);
vxc_float4 in_f, scale_f;
VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
vxc_float4 sub, norm;
sub = in_f - mean;
norm = scale_f * vari * sub + bias_f;
half4 norm_h;
_viv_asm(CONV, norm_h, norm);
vxc_half8 dst;
VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniExtractHalf4_dp4x4);
vxc_short8 dstval;
_viv_asm(COPY, dstval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
/*****************************layernorm uint8 to uint8****************************/
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniSumU8_16x1;
_viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform int tmpZp2;
_viv_uniform float e2InScale;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
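/* Quantized variant, u8 in / u8 out. Pass 1 sums the raw u8 values and their squares in
 * integer form (uniSumU8_16x1 / uniSqrSum_16x1) and then rescales with input_scale,
 * e2InScale and the zero-point correction terms (sumInZp, tmpZp1, tmpZp2). Pass 2
 * dequantizes 16 pixels at a time, normalizes with the fp16 scale and fp32 bias, and
 * requantizes with outputScale and output_zp. */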
__kernel void layer_norm_U8toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_uchar16 src0, src2;
vxc_short8 src1;
vxc_half8 scale_h;
float sum = 0, sqr = 0;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
int tmpSum = 0, tmpSqr = 0;
vxc_int4 tmpSum1;
vxc_int4 tmpSqr1;
short zp = inputZP;
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);
}
sum = (tmpSum + sumInZp) * input_scale;
sqr = (tmpSqr + tmpZp2) * e2InScale;
float mean, vari;
mean = sum * dimRatio;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_int4 tmpVal0, tmpVal1;
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int2 coord_bias = (int2)(0, 0);
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \
VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
}
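
For reference, the integer accumulation above folds the uint8 zero point into the running sums so that the mean and variance come out in dequantized units. A minimal scalar sketch of the same statistics pass is given below; it assumes the host-side uniforms are defined as sumInZp = -N*zp, tmpZp1 = -2*zp, tmpZp2 = N*zp*zp, e2InScale = input_scale*input_scale and dimRatio = 1/N, which is an inference since those values are computed outside this diff.

#include <math.h>

/* Hypothetical scalar reference for the statistics pass of layer_norm_U8toU8.
 * q points at the N uint8 codes of one normalized row; zp and input_scale are
 * the input quantization parameters. All host-side constants are recomputed
 * here under the assumptions stated above. */
static void u8_layernorm_stats(const unsigned char *q, int N, int zp,
                               float input_scale, float eps,
                               float *mean_out, float *inv_std_out)
{
    int tmpSum = 0, tmpSqr = 0;
    for (int i = 0; i < N; i++) {
        tmpSum += q[i];            /* role of uniSumU8_16x1 */
        tmpSqr += q[i] * q[i];     /* role of uniSqrSum_16x1 */
    }
    int   sumInZp   = -N * zp;                       /* assumed definition */
    int   tmpZp1    = -2 * zp;                       /* assumed definition */
    int   tmpZp2    = N * zp * zp;                   /* assumed definition */
    float e2InScale = input_scale * input_scale;     /* assumed definition */
    float dimRatio  = 1.0f / (float)N;               /* assumed definition */
    /* sum((q - zp) * s) and sum(((q - zp) * s)^2), folded exactly as in the kernel. */
    float sum = (tmpSum + sumInZp) * input_scale;
    float sqr = (tmpSqr + tmpZp1 * tmpSum + tmpZp2) * e2InScale;
    float mean = sum * dimRatio;
    float vari = sqr * dimRatio - mean * mean + eps;
    *mean_out    = mean;
    *inv_std_out = 1.0f / sqrtf(vari);               /* rsqrt(vari) in the kernel */
}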
/***************************layernorm float16 to uint8**************************/
__kernel void layer_norm_F16toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
vxc_short8 src0, src1;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniFp16SumSqr_dp8x2);
sum += sumsqr.x;
sqr += sumsqr.y;
}
vxc_float mean;
mean = sum * dimRatio;
vxc_float vari;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 bias_f;
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = read_imagef(bias, coord.xw);
vxc_half8 in_h, scale_h;
_viv_asm(COPY, in_h, src0, 16);
_viv_asm(COPY, scale_h, src1, 16);
vxc_float4 in_f, scale_f;
VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
vxc_float4 sub, norm;
sub = in_f - mean;
norm = scale_f * vari * sub + bias_f;
norm = norm * outputScale + output_zp;
int4 output_int4;
output_int4 = convert_int4_rte(norm);
vxc_uchar8 dst;
VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
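
The FP16 to U8 variant ends by pushing each normalized value through the affine output quantizer, norm * outputScale + output_zp, with round-to-nearest-even. A small hedged illustration of that last step follows; it assumes outputScale is the reciprocal of the output tensor scale and that the final pack clamps to the uint8 range, neither of which is spelled out in this diff.

#include <math.h>
#include <stdint.h>

/* Illustrative requantization of one normalized value, mirroring
 * convert_int4_rte(norm * outputScale + output_zp) followed by the byte pack. */
static uint8_t requantize_u8(float norm, float outputScale, float output_zp)
{
    float v = norm * outputScale + output_zp;
    long  q = lrintf(v);      /* round to nearest, ties to even, like convert_int4_rte */
    if (q < 0)   q = 0;       /* assumed: the pack step saturates to [0, 255] */
    if (q > 255) q = 255;
    return (uint8_t)q;
}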

View File

@@ -7,12 +7,9 @@ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;
__kernel void vxcLayerNorm(
image2d_array_t input,
image2d_array_t bias,
image2d_array_t scale,
image2d_array_t output,
float eps)
__kernel void layer_norm_F16toF16_2D(
image2d_t input, image2d_t bias, image2d_t scale,
image2d_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_short8 src0, src1;
@@ -44,7 +41,7 @@ __kernel void vxcLayerNorm(
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = read_imagef(bias, coord.xwww);
bias_f = read_imagef(bias, coord.xw);
vxc_half8 in_h, scale_h;
_viv_asm(COPY, in_h, src0, 16);
_viv_asm(COPY, scale_h, src1, 16);
@@ -76,7 +73,7 @@ _viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
_viv_uniform float outputScale;
_viv_uniform int output_ZP;
_viv_uniform float output_zp;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform int tmpZp2;
@@ -84,12 +81,9 @@ _viv_uniform float e2InScale;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
__kernel void vxcLayerNorm_u8(
image2d_array_t input,
image2d_array_t bias,
image2d_array_t scale,
image2d_array_t output,
float eps)
__kernel void layer_norm_U8toU8_2D(
image2d_t input, image2d_t bias, image2d_t scale,
image2d_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_uchar16 src0, src2;
@@ -121,15 +115,15 @@ __kernel void vxcLayerNorm_u8(
vari = rsqrt(vari);
vxc_int4 tmpVal0, tmpVal1;
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int4 coord_bias = (int4)(0, 0, 0, 0);
int2 coord_bias = (int2)(0, 0);
for(coord.x = 0; coord.x < width; coord.x += 16)
{
coord_bias.x = coord.x;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
@@ -151,49 +145,41 @@ __kernel void vxcLayerNorm_u8(
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
tmpData0 = tmpData0 * input_scale - mean;
tmpData1 = tmpData1 * input_scale - mean;
tmpData2 = tmpData2 * input_scale - mean;
tmpData3 = tmpData3 * input_scale - mean;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}
}
/***************************layernorm float16 to uint8**************************/
_viv_uniform float outputZP;
__kernel void vxcLayerNormFP16toU8(
image2d_array_t input,
image2d_array_t bias,
image2d_array_t scale,
image2d_array_t output,
float eps)
__kernel void layer_norm_F16toU8_2D(
image2d_t input, image2d_t bias, image2d_t scale,
image2d_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_short8 src0, src1;
@@ -225,7 +211,7 @@ __kernel void vxcLayerNormFP16toU8(
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = read_imagef(bias, coord.xwww);
bias_f = read_imagef(bias, coord.xw);
vxc_half8 in_h, scale_h;
_viv_asm(COPY, in_h, src0, 16);
_viv_asm(COPY, scale_h, src1, 16);
@@ -237,7 +223,7 @@ __kernel void vxcLayerNormFP16toU8(
vxc_float4 sub, norm;
sub = in_f - mean;
norm = scale_f * vari * sub + bias_f;
norm = norm * outputScale + outputZP;
norm = norm * outputScale + output_zp;
int4 output_int4;
output_int4 = convert_int4_rte(norm);
vxc_uchar8 dst;
@@ -245,4 +231,4 @@ __kernel void vxcLayerNormFP16toU8(
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
}
}

View File

@@ -0,0 +1,167 @@
#include "cl_viv_vx_ext.h"
/**************************layernorm int16 to int16*******************************/
_viv_uniform int width;
_viv_uniform float dimRatio;
_viv_uniform float dimRatio_scale;
_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform float e2InScale;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
__kernel void layer_norm_I16toI16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.w, baseAddr);
vxc_short8 src0, src1, dst;
vxc_float sum = 0, sqr = 0;
for(; coord.x < width;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);
sum += sumsqr.x;
sqr = sqr + sumsqr.y * e2InScale;
}
vxc_float mean;
mean = sum * dimRatio_scale;
vxc_float vari;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_half8 scale_h;
vxc_int4 tmpVal0, tmpVal1;
int2 coord_bias = (int2)(0, 0);
for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 * input_scale - mean;
norm = scale_f0 * vari * sub + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
sub = tmpData1 * input_scale - mean;
norm = scale_f1 * vari * sub + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel void layer_norm_I16toI16_2D(
image2d_t input, image2d_t bias, image2d_t scale,
image2d_t output, float eps)
{
int2 coord = (int2)(0, get_global_id(1));
vxc_short8 src0, src1, dst;
vxc_float sum = 0, sqr = 0;
for(; coord.x < width;)
{
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);
sum += sumsqr.x;
sqr = sqr + sumsqr.y * e2InScale;
}
vxc_float mean, vari;
mean = sum * dimRatio_scale;
vari = sqr * dimRatio - mean * mean;
vari += eps;
vari = rsqrt(vari);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_half8 scale_h;
vxc_int4 tmpVal0, tmpVal1;
int2 coord_bias = (int2)(0, 0);
for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 * input_scale - mean;
norm = scale_f0 * vari * sub + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
sub = tmpData1 * input_scale - mean;
norm = scale_f1 * vari * sub + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
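
Unlike the uint8 kernels, the int16 path rescales the squared term inside the accumulation loop (sumsqr.y * e2InScale) and applies dimRatio_scale to the raw sum, so the first loop never subtracts a zero point. A scalar sketch of the equivalent computation is below; it assumes dimRatio = 1/N, dimRatio_scale = input_scale / N, e2InScale = input_scale * input_scale and a zero input zero-point for the statistics, which matches how the loop uses the raw int16 codes but is not stated explicitly in this diff.

#include <math.h>

/* Hypothetical scalar equivalent of the layer_norm_I16toI16 statistics pass. */
static void i16_layernorm_stats(const short *q, int N, float input_scale, float eps,
                                float *mean_out, float *inv_std_out)
{
    float e2InScale = input_scale * input_scale;         /* assumed definition */
    float sum = 0.0f, sqr = 0.0f;
    for (int i = 0; i < N; i++) {
        sum += (float)q[i];                              /* raw int16 codes           */
        sqr += (float)q[i] * (float)q[i] * e2InScale;    /* squares, already rescaled */
    }
    float dimRatio       = 1.0f / (float)N;              /* assumed definition */
    float dimRatio_scale = input_scale / (float)N;       /* assumed definition */
    float mean = sum * dimRatio_scale;                   /* mean in float units */
    float vari = sqr * dimRatio - mean * mean + eps;
    *mean_out    = mean;
    *inv_std_out = 1.0f / sqrtf(vari);
}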

View File

@@ -0,0 +1,252 @@
#include "cl_viv_vx_ext.h"
/*****************************layernorm uint8 to fp16****************************/
_viv_uniform int width;
_viv_uniform float dimRatio;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniSumU8_16x1;
_viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform int tmpZp2;
_viv_uniform float e2InScale;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits UniPackFP16even_2x8;
__kernel void layer_norm_U8toF16(
image2d_array_t input,
image2d_t bias,
image2d_t scale,
image2d_array_t output,
float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0;
vxc_int4 tmpSum1;
vxc_int4 tmpSqr1;
int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);
}
sum = (tmpSum + sumInZp) * input_scale;
sqr = (tmpSqr + tmpZp2) * e2InScale;
float mean, vari;
mean = sum * dimRatio;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int2 coord_bias = (int2)(0, 0);
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_short8 src1, outval;
short zp = inputZP;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
_viv_asm(CONV, tmpVal0, norm);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel void layer_norm_U8toF16_2D(
image2d_t input,
image2d_t bias,
image2d_t scale,
image2d_t output,
float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0;
vxc_int4 tmpSum1;
vxc_int4 tmpSqr1;
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);
}
sum = (tmpSum + sumInZp) * input_scale;
sqr = (tmpSqr + tmpZp2) * e2InScale;
float mean, vari;
mean = sum * dimRatio;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int2 coord_bias = (int2)(0, 0);
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_short8 src1, outval;
short zp = inputZP;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
for(coord.x = 0; coord.x < width; coord.x += 16)
{
coord_bias.x = coord.x;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
_viv_asm(CONV, tmpVal0, norm);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x = coord.x;
VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x += 8;
VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@@ -0,0 +1,426 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform int width;
_viv_uniform int height;
_viv_uniform int height_depth;
_viv_uniform float dimRatio;
_viv_uniform int group_num;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32(
image2d_array_t input, image2d_t output)
{
int gidx = get_global_id(0) << 3;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(gidx, 0, gidz, 0);
vxc_short8 src0;
vxc_half8 in_h;
vxc_float4 sumsqr;
vxc_float4 tmpSumSqr = (vxc_float4)(0);
__local float lcl_sum[16];
__local float lcl_sqr[16];
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
if(gidx < width)
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniFp16SumSqr_dp8x2);
tmpSumSqr += sumsqr;
}
}
lcl_sum[lidx] = tmpSumSqr.x;
lcl_sqr[lidx] = tmpSumSqr.y;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float sum = 0;
float sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 data = (float4)(sum, sqr, 0, 0);
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D(
image2d_array_t input, image2d_t output)
{
int gidx = get_global_id(0) << 3;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int gidy = gidz * height;
int2 coord = (int2)(gidx, gidy);
vxc_short8 src0;
vxc_half8 in_h;
vxc_float4 sumsqr;
vxc_float4 tmpSumSqr = (vxc_float4)(0);
__local float lcl_sum[16];
__local float lcl_sqr[16];
int endH = gidy + height;
if(gidx < width)
{
for(; coord.y < endH;)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniFp16SumSqr_dp8x2);
tmpSumSqr += sumsqr;
}
}
lcl_sum[lidx] = tmpSumSqr.x;
lcl_sqr[lidx] = tmpSumSqr.y;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float sum = 0;
float sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 data = (float4)(sum, sqr, 0, 0);
write_imagef(output, coord_out, data);
}
}
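
The two sum/sum-of-squares kernels above are the first half of a two-stage reduction over the whole width x height plane: every 16-lane workgroup accumulates partial (sum, sumsq) pairs in local memory, lane 0 folds them with four float4 dot products, and one (sum, sqr, 0, 0) entry per group is written to the meanVari image. The normalization kernels that follow combine group_num such entries. A hedged scalar sketch of that second stage, assuming dimRatio is the reciprocal of the number of normalized elements (width * height here):

#include <math.h>

/* Illustrative second-stage combine of the per-group partial sums, mirroring the
 * mean_vari loop at the top of layernorm_wh_F16toF16 and the related kernels. */
static void combine_group_sums(const float *group_sum, const float *group_sqr,
                               int group_num, float dimRatio, float eps,
                               float *mean_out, float *inv_std_out)
{
    float sum = 0.0f, sqr = 0.0f;
    for (int i = 0; i < group_num; i++) {   /* one entry per 16-lane workgroup */
        sum += group_sum[i];
        sqr += group_sqr[i];
    }
    float mean = sum * dimRatio;            /* mean_vari.s0 in the kernel       */
    float vari = sqr * dimRatio - mean * mean + eps;
    *mean_out    = mean;
    *inv_std_out = 1.0f / sqrtf(vari);      /* mean_vari.s1 after rsqrt         */
}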
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_short8 src0;
vxc_short8 src1;
vxc_half8 scale_h, in_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
vxc_float4 tmpData0, tmpData1;
vxc_short8 outval;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * sub + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
sub = tmpData1 - mean_vari.s0;
norm = scale_f1 * mean_vari.s1 * sub + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_short8 src0;
vxc_short8 src1;
vxc_half8 scale_h, in_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
vxc_float4 tmpData0, tmpData1;
vxc_short8 outval;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * sub + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
sub = tmpData1 - mean_vari.s0;
norm = scale_f1 * mean_vari.s1 * sub + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_short8 src0;
vxc_short8 src1;
vxc_half8 scale_h, in_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
vxc_float4 tmpData0, tmpData1;
vxc_uchar16 outval;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * sub + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
sub = tmpData1 - mean_vari.s0;
norm = scale_f1 * mean_vari.s1 * sub + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_short8 src0;
vxc_short8 src1;
vxc_half8 scale_h, in_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
vxc_float4 tmpData0, tmpData1;
vxc_uchar16 outval;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, in_h, src0, 16);
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
vxc_float4 sub, norm;
sub = tmpData0 - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * sub + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
sub = tmpData1 - mean_vari.s0;
norm = scale_f1 * mean_vari.s1 * sub + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@@ -0,0 +1,266 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;
_viv_uniform float e2InScale;
_viv_uniform int width;
_viv_uniform float input_scale;
_viv_uniform int height;
_viv_uniform int height_depth;
_viv_uniform float dimRatio;
_viv_uniform int group_num;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform int inputZP;
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32(
image2d_array_t input, image2d_t output)
{
int gidx = get_global_id(0) << 4;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(gidx, 0, gidz, 0);
vxc_short8 src0;
float4 tmpSumSqr = (float4)(0);
__local float lcl_sum[16];
__local float lcl_sqr[16];
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
if(gidx < width)
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);
tmpSumSqr += sumsqr;
}
tmpSumSqr.x *= input_scale;
tmpSumSqr.y *= e2InScale;
}
lcl_sum[lidx] = tmpSumSqr.x;
lcl_sqr[lidx] = tmpSumSqr.y;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float4 data = (float4)(0);
for(int i = 0; i < 4; i++)
{
data.x += dot(tmp_sum[i], one);
data.y += dot(tmp_sqr[i], one);
}
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D(
image2d_t input, image2d_t output)
{
int gidx = get_global_id(0) << 4;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int gidy = gidz * height;
int2 coord = (int2)(gidx, gidy);
vxc_short8 src0;
float4 tmpSumSqr = (float4)(0);
__local float lcl_sum[16];
__local float lcl_sqr[16];
int endH = gidy + height;
if(gidx < width)
{
for(; coord.y < endH;)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);
tmpSumSqr += sumsqr;
}
tmpSumSqr.x *= input_scale;
tmpSumSqr.y *= e2InScale;
}
lcl_sum[lidx] = tmpSumSqr.x;
lcl_sqr[lidx] = tmpSumSqr.y;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float4 data = (float4)(0);
for(int i = 0; i < 4; i++)
{
data.x += dot(tmp_sum[i], one);
data.y += dot(tmp_sqr[i], one);
}
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_short8 src0, src1, outval;
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_short8 src0, src1, outval;
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@@ -0,0 +1,419 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniSumU8_16x1;
_viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform float e2InScale;
_viv_uniform float rowSumScale;
_viv_uniform int width;
_viv_uniform float input_scale;
_viv_uniform int height;
_viv_uniform int height_depth;
_viv_uniform float dimRatio;
_viv_uniform int group_num;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform float outputScale;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform int inputZP;
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32(
image2d_array_t input, image2d_t output)
{
int gidx = get_global_id(0) << 4;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(gidx, 0, gidz, 0);
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;
__local float lcl_sum[16];
__local float lcl_sqr[16];
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
if(gidx < width)
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.y++;
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);
}
sqr += (tmpSqr * e2InScale + rowSumScale);
sum = (tmpSum + sumInZp) * input_scale;
}
lcl_sum[lidx] = sum;
lcl_sqr[lidx] = sqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
sum = 0; sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 data = (float4)(sum, sqr, 0, 0);
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D(
image2d_t input, image2d_t output)
{
int gidx = get_global_id(0) << 4;
int lidx = get_local_id(0);
int gidz = get_global_id(1);
int gidy = gidz * height;
int2 coord = (int2)(gidx, gidy);
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;
__local float lcl_sum[16];
__local float lcl_sqr[16];
int endH = gidy + height;
if(gidx < width)
{
for(; coord.y < endH;)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.y++;
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);
}
sqr += (tmpSqr * e2InScale + rowSumScale);
sum = (tmpSum + sumInZp) * input_scale;
}
lcl_sum[lidx] = sum;
lcl_sqr[lidx] = sqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(get_group_id(0) << 2, gidz);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
sum = 0; sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 data = (float4)(sum, sqr, 0, 0);
write_imagef(output, coord_out, data);
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_uchar16 src0;
vxc_short8 src1, outval;
vxc_half8 scale_h, dst;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
half4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y; coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_uchar16 src0;
vxc_short8 src1, outval;
vxc_half8 scale_h, dst;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
half4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8(
image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);
vxc_uchar16 src0 , outval;
vxc_short8 src1;
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_sum);
coord_sum.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
int4 coord_bias = coord_para;
int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;
_viv_asm(MOV, coord_para.w, baseAddr_c);
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.z, baseAddr);
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D(
image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,
image2d_t output, float eps)
{
int2 coord = (int2)(get_global_id(0), 0);
int2 coord_bias = (int2)(0, 0);
vxc_uchar16 src0, outval;
vxc_short8 src1;
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_float4 mean_vari = (vxc_float4)(0);
for(int i = 0; i < group_num; i++)
{
mean_vari += read_imagef(meanVari, coord_bias);
coord_bias.x += 4;
}
mean_vari *= dimRatio;
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;
mean_vari.s1 = rsqrt(mean_vari.s1);
coord_bias = coord;
short zp = inputZP;
vxc_float4 tmpData0, tmpData1, norm;
vxc_int4 tmpVal0, tmpVal1;
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x = coord.x;
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
tmpData0 = tmpData0 * input_scale - mean_vari.s0;
tmpData1 = tmpData1 * input_scale - mean_vari.s0;
norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;
tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);
norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}
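For reference, a minimal host-side C sketch of the quantized layer-norm arithmetic the layernorm_wh_* kernels above perform: dequantize the U8 input with input_scale/inputZP, normalize against the mean and E[x^2] accumulated in meanVari, then apply the per-element scale and bias. The helper name and sample values are illustrative; the U8-output variants additionally requantize with outputScale/output_zp.
#include <math.h>
#include <stdint.h>
#include <stdio.h>
/* layernorm_u8_ref is a hypothetical helper mirroring the kernel math:
 * mean_vari.s0 = E[x], mean_vari.s1 = E[x^2], variance = s1 - s0*s0 + eps,
 * out = gamma * rsqrt(variance) * (x_dequant - mean) + beta. */
static void layernorm_u8_ref(const uint8_t *x, const float *gamma, const float *beta,
                             float *out, int n, float input_scale, int input_zp, float eps)
{
    float mean = 0.0f, sqr = 0.0f;
    for (int i = 0; i < n; ++i) {
        float v = (x[i] - input_zp) * input_scale;  /* dequantize U8 input */
        mean += v;
        sqr  += v * v;
    }
    mean /= n;                                      /* mean_vari.s0 */
    float var = sqr / n - mean * mean + eps;        /* mean_vari.s1 - s0*s0 + eps */
    float inv_std = 1.0f / sqrtf(var);              /* kernel uses rsqrt() */
    for (int i = 0; i < n; ++i) {
        float v = (x[i] - input_zp) * input_scale - mean;
        out[i] = gamma[i] * inv_std * v + beta[i];  /* scale * norm + bias */
    }
}
int main(void)
{
    uint8_t x[4] = { 10, 20, 30, 40 };
    float gamma[4] = { 1, 1, 1, 1 }, beta[4] = { 0, 0, 0, 0 }, out[4];
    layernorm_u8_ref(x, gamma, beta, out, 4, 0.5f, 0, 1e-5f);
    for (int i = 0; i < 4; ++i) printf("%f\n", out[i]);
    return 0;
}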

View File

@ -1,136 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp1BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniExtractInt32BgraToU8Bgr_2x8;
_viv_uniform int zp;
_viv_uniform float outputScale;
__kernel void pre_process_bgra_scale_nhwc_U8toU8(
__read_only image2d_array_t input, __write_only image2d_array_t output,
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
{
int4 gidx = get_global_id(0);
int gidy = get_global_id(1);
gidx += (int4)(0, 1, 2, 3);
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);
int4 sx = fx & 0xffff8000; // Floor
int fy, sy;
fx -= sx;
sx = sx >> 15;
fx = (fx +(1 << 4)) >> 5;
// for y
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);
sy = fy & 0xffff8000; // Floor
fy -= sy;
sy = sy >> 15;
sy = sy < 0 ? 0 : sy;
fy = fy < 0 ? 0 : fy;
fy = (fy + (1<< 4)) >> 5;
sx = (sx + (*xOffset)) * 4 ;
sy += (*yOffset);
int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y);
vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3;
vxc_uchar16 dataB, dataG, dataR;
VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
srcPos.x = sx.z;
srcPos.w = sx.w;
VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar4 val_u8;
int4 tmp1, tmp2, result1, result2;
float4 tmpDst, tmp0;
float4 mean = (float4)(bMean, gMean, rMean, 0);
//tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);
int tmpV = 1 << 19;
vxc_short8 tmpFx;
VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),
uniConvertInt32toUint8_2x8);
//tmpFx = fx.xxxx;
VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
uniBilinearTmp1BgraShort_4x4);
VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
uniBilinearTmp2BgraShort_4x4);
tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);
VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniConvertIntergetoF32_4x4);
tmpDst = (tmp0 - mean) * var;
result1 = convert_int4_rte(tmpDst * outputScale + zp);
//tmpFx = fx.yyyy;
VXC_DP4x4(tmp1, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4);
VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4);
tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);
VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniConvertIntergetoF32_4x4);
tmpDst = (tmp0 - mean) * var;
result2 = convert_int4_rte(tmpDst * outputScale + zp);
vxc_uchar16 dst;
VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1),
uniExtractInt32BgraToU8Bgr_2x8);
//tmpFx = fx.zzzz;
VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4);
VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4);
tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);
VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniConvertIntergetoF32_4x4);
tmpDst = (tmp0 - mean) * var;
result1 = convert_int4_rte(tmpDst * outputScale + zp);
//tmpFx = fx.wwww;
VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4);
VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4);
tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);
VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniConvertIntergetoF32_4x4);
tmpDst = (tmp0 - mean) * var;
result2 = convert_int4_rte(tmpDst * outputScale + zp);
VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(6, 11, 0, VXC_RM_ToNearestEven, 1),
uniExtractInt32BgraToU8Bgr_2x8);
int4 dstPos = (int4)(get_global_id(0) * 3, gidy, 0, 0);
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}
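The scale kernels above resample entirely in fixed point: the destination pixel is mapped to a Q15 source coordinate with half-pixel centering, the fraction is reduced to a Q10 weight, the two lerps are accumulated to Q20, and the 1 << 19 constant rounds the result back down. A small single-channel C sketch of that arithmetic; the ratio construction, the edge clamps and the sample values are illustrative (the kernels themselves only clamp the y coordinate).
#include <stdint.h>
#include <stdio.h>
/* Single-channel reference of the Q15/Q10 bilinear math used above.
 * x_ratio / y_ratio are assumed to be (src_size << 15) / dst_size,
 * matching the xRatio/yRatio inputs of the kernels. */
static uint8_t bilinear_q15(const uint8_t *img, int w, int h, int stride,
                            int dx, int dy, int x_ratio, int y_ratio)
{
    int fx = dx * x_ratio + (x_ratio >> 1) - (1 << 14);  /* Q15 src coord  */
    int fy = dy * y_ratio + (y_ratio >> 1) - (1 << 14);
    int sx = fx & 0xffff8000;                            /* floor (Q15)    */
    int sy = fy & 0xffff8000;
    fx -= sx;  sx >>= 15;                                /* integer column */
    fy -= sy;  sy >>= 15;                                /* integer row    */
    if (sy < 0) { sy = 0; fy = 0; }
    if (sx < 0) { sx = 0; fx = 0; }                      /* clamp added for the sketch */
    fx = (fx + (1 << 4)) >> 5;                           /* Q15 -> Q10     */
    fy = (fy + (1 << 4)) >> 5;

    int x1 = (sx + 1 < w) ? 1 : 0;                       /* clamp right edge */
    const uint8_t *p0 = img + sy * stride + sx;
    const uint8_t *p1 = p0 + ((sy + 1 < h) ? stride : 0);
    int top = (p0[0] << 10) + fx * (p0[x1] - p0[0]);     /* horizontal lerp, Q10 */
    int bot = (p1[0] << 10) + fx * (p1[x1] - p1[0]);
    int q20 = (top << 10) + fy * (bot - top);            /* vertical lerp, Q20   */
    return (uint8_t)((q20 + (1 << 19)) >> 20);           /* round and descale    */
}

int main(void)
{
    uint8_t img[2 * 2] = { 0, 100, 100, 200 };
    int ratio = (2 << 15) / 4;                           /* 2x2 -> 4x4 upscale */
    printf("%d\n", bilinear_q15(img, 2, 2, 2, 1, 1, ratio, ratio));
    return 0;
}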

View File

@ -1,89 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int bOrder;
_viv_uniform int rOrder;
_viv_uniform float outputScaleVar;
_viv_uniform float bMeanScaleVarZp;
_viv_uniform float gMeanScaleVarZp;
_viv_uniform float rMeanScaleVarZp;
_viv_uniform uint xrIntFloat_16;
_viv_uniform uint yrIntFloat_16;
_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;
_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;
_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;
_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;
_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;
__kernel void pre_process_nv12_trans_U8toU8(
__read_only image2d_t y_img, __read_only image2d_t uv_img,
__write_only image2d_t output,
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
{
uint4 gidx = get_global_id(0);
uint gidy = get_global_id(1);
gidx += (uint4)(0, 1, 2, 3);
uint dy = (gidy * yrIntFloat_16) >> 16;
uint4 dx = (gidx * xrIntFloat_16) >> 16;
int sy = convert_int(dy) + (*yOffset);
int4 sx = convert_int4(dx) + (*xOffset);
int4 uvX = sx & 0xfffffffe;
int uvY = sy >> 1;
vxc_uchar16 Y, UV;
int2 coord = (int2)(sx.x, sy);
int2 coord_uv = (int2)(uvX.x, uvY);
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord.x = sx.y;
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord.x = sx.z;
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord.x = sx.w;
VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(UV, uv_img, coord_uv, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_uv.x = uvX.y;
VXC_ReadImage(UV, uv_img, coord_uv, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_uv.x = uvX.z;
VXC_ReadImage(UV, uv_img, coord_uv, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_uv.x = uvX.w;
VXC_ReadImage(UV, uv_img, coord_uv, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
vxc_char16 tmpUV;
short tmpVal = 128;
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);
float4 tmpDstB, tmpDstG, tmpDstR;
VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);
VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);
VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);
int4 result, dstR, dstG, dstB;
vxc_uchar16 dst, tmpPack;
dstB = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);
dstG = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);
dstR = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);
if(bOrder == 2)
{
int4 exchangeData = dstB;
dstB = dstR;
dstR = exchangeData;
}
VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);
int2 dstPos = (int2)(get_global_id(0) * 3, gidy);
VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}
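The NV12 trans kernel above does no interpolation: each destination pixel is mapped to its nearest source pixel through a 16.16 fixed-point ratio (xrIntFloat_16 / yrIntFloat_16), and the interleaved chroma plane is addressed at the even column and half row. A tiny C illustration of that mapping; the kernel receives the ratio as a uniform, so the (src_w << 16) / dst_w setup here is only an assumption.
#include <stdint.h>
#include <stdio.h>
int main(void)
{
    int src_w = 640, dst_w = 300;                            /* illustrative sizes */
    uint32_t xr_q16 = ((uint32_t)src_w << 16) / dst_w;       /* plausible xrIntFloat_16 */
    for (int dst_x = 0; dst_x < 5; ++dst_x) {
        uint32_t src_x = ((uint32_t)dst_x * xr_q16) >> 16;   /* dx = (gidx * xrIntFloat_16) >> 16 */
        uint32_t uv_x  = src_x & ~1u;                        /* NV12 chroma: even column */
        printf("dst %d -> luma col %u, chroma col %u\n", dst_x, src_x, uv_x);
    }
    return 0;
}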

View File

@ -1,94 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float outputScale;
_viv_uniform float outputZP;
_viv_uniform VXC_512Bits uniNormilizationLo_2x8;
_viv_uniform VXC_512Bits uniNormilizationHi_2x8;
#define IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(dst_name, dst_type, copy_type) \
__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
global int *yOffset, \
float rMean, \
float gMean, \
float bMean, \
float f32Var, \
int reverse_channel, \
int trans \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
\
coord.xy += (int2) (*xOffset, *yOffset); \
vxc_uchar16 src0, src1; \
dst_type dst0, dst1; \
copy_type dst; \
\
VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
f32Var *= outputScale; \
float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \
bMean * f32Var - outputZP, f32Var); \
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData); \
\
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); \
coord_out.z = coord_out.x + 8; \
\
VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniNormilizationLo_2x8); \
VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), \
uniNormilizationHi_2x8); \
_viv_asm(COPY, dst, dst0, 16); \
VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, dst, dst1, 16); \
VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \
}
IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(I16, vxc_short8, vxc_short8)
IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(F16, vxc_half8, vxc_short8)
#define IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(dst_name, dst_type) \
__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
global int *yOffset, \
float rMean, \
float gMean, \
float bMean, \
float f32Var, \
int reverse_channel, \
int trans \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
coord.xy += (int2) (*xOffset, *yOffset); \
vxc_uchar16 src0, src1; \
dst_type dst; \
\
VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
f32Var *= outputScale; \
float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \
bMean * f32Var - outputZP, f32Var); \
\
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData); \
\
int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \
\
VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniNormilizationLo_2x8); \
VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), \
uniNormilizationHi_2x8); \
VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); \
}
IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(U8, vxc_uchar16)
IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(I8, vxc_char16)
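In the copy kernels above, the per-channel mean subtraction, the var scaling and the output quantization fold into one multiply-subtract: with f32Var = var * outputScale, dst = src * f32Var - (mean * f32Var - outputZP), and it is exactly that per-channel constant that paramData carries into the DP2x8 instructions. A short C check of the identity, with all numeric values illustrative.
#include <stdint.h>
#include <stdio.h>
int main(void)
{
    float mean[3] = { 123.7f, 116.8f, 103.9f };   /* rMean, gMean, bMean (illustrative) */
    float var = 1.0f / 58.4f;
    float out_scale = 2.0f, out_zp = 128.0f;      /* outputScale, outputZP (illustrative) */
    float f32_var = var * out_scale;              /* f32Var *= outputScale */
    uint8_t src[3] = { 200, 50, 90 };
    for (int c = 0; c < 3; ++c) {
        float folded   = src[c] * f32_var - (mean[c] * f32_var - out_zp);
        float unfolded = (src[c] - mean[c]) * var * out_scale + out_zp;
        printf("channel %d: folded %f, unfolded %f\n", c, folded, unfolded);
    }
    return 0;
}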

View File

@ -1,172 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniVecShift10;
_viv_uniform VXC_512Bits uniAddRShift;
_viv_uniform VXC_512Bits uniGetTempVal;
_viv_uniform VXC_512Bits uniExtractBytes;
_viv_uniform VXC_512Bits uniUnpackToR;
_viv_uniform VXC_512Bits uniUnpackToG;
_viv_uniform VXC_512Bits uniUnpackToB;
_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;
_viv_uniform float outputScale;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform float outputZP;
_viv_uniform VXC_512Bits uniRePackRGBLo_2x8;
_viv_uniform VXC_512Bits uniRePackRGBHi_2x8;
#define IMAGE_PRE_PROCESS_NHWC(dst_name, conv_type, dst_type, copy_type) \
__kernel void pre_process_rgb_scale_nhwc_U8to##dst_name \
( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
global int *yOffset, \
float rMean, \
float gMean, \
float bMean, \
float f32Var, \
int reverse_channel, \
int trans \
) \
{ \
int2 ratioXY = (int2)(*xRatio, *yRatio); \
int4 xPos = get_global_id(0); \
int yPos = get_global_id(1); \
int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \
xPos += (int4)(0, 1, 2, 3); \
\
/*x*/ \
int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \
int4 sx = fx0 & 0xffff8000; \
fx0 -= sx; \
sx = sx >> 15; \
\
vxc_short4 fx; \
VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \
/*y*/ \
int fy = yPos * ratioXY.y + ratioSufXY.y; \
int sy = fy & 0xffff8000; \
\
fy -= sy; \
sy = sy >> 15; \
\
fy = (fy + (1<< 4)) >> 5; \
\
vxc_uchar16 line0RGB1, line0RGB2; \
vxc_uchar16 line1RGB3, line1RGB4; \
int4 coord; \
sx = sx * 3 + *xOffset; \
coord.xyz = sx.xyz; \
coord.w = sy + *yOffset; \
int2 coord1 = (int2)(sx.w, coord.w); \
VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \
\
VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \
\
float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \
\
bgrMean *= f32Var; \
\
int4 test01, temp1; \
int4 test02, temp2; \
int4 tt; \
vxc_uchar4 val; \
int4 coord_out = (int4)(xPos.x * 3, yPos, xPos.x * 3 + 6, 0); \
\
vxc_uchar8 line1, line2; \
\
/*R*/ \
VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \
VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \
\
VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp1 = temp1 + test01; \
\
VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp2 = temp2 + test02; \
temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
\
vxc_float4 tmp_dst; \
vxc_uchar4 u8_dst; \
VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
/*convert U8 to dst*/ \
dst_type dstRG, dstB, dst; \
tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \
tmp_dst = tmp_dst * outputScale + outputZP; \
conv_type dst0; \
_viv_asm(CONV_RTE, dst0, tmp_dst); \
VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
\
/*G*/ \
VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \
VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \
\
VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp1 = temp1 + test01; \
\
VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp2 = temp2 + test02; \
temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
\
VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
tmp_dst = tmp_dst * f32Var - bgrMean.y; \
tmp_dst = tmp_dst * outputScale + outputZP; \
_viv_asm(CONV_RTE, dst0, tmp_dst); \
VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
\
/*B*/ \
VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \
VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \
\
VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp1 = temp1 + test01; \
\
VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \
VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
temp2 = temp2 + test02; \
temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
\
VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
tmp_dst = tmp_dst * f32Var - bgrMean.x; \
tmp_dst = tmp_dst * outputScale + outputZP; \
_viv_asm(CONV_RTE, dst0, tmp_dst); \
VXC_DP2x8(dstB, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBLo_2x8); \
copy_type result; \
_viv_asm(COPY, result, dst, 16); \
VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBHi_2x8); \
_viv_asm(COPY, result, dst, 16); \
VXC_WriteImage(output, coord_out.zy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \
}
IMAGE_PRE_PROCESS_NHWC(U8, uint4, vxc_uchar16, vxc_uchar16)
IMAGE_PRE_PROCESS_NHWC(I8, int4, vxc_char16, vxc_char16)
IMAGE_PRE_PROCESS_NHWC(I16, int4, vxc_short8, vxc_short8)
IMAGE_PRE_PROCESS_NHWC(F16, half4, vxc_half8, vxc_short8)

View File

@ -23,19 +23,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniPackBG0_2x8;
_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;
_viv_uniform VXC_512Bits uniPackRB0_2x8;
_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;
_viv_uniform VXC_512Bits uniPackGR1_2x8;
_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;
_viv_uniform VXC_512Bits uniPackBG1_2x8;
_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;
_viv_uniform VXC_512Bits uniPackRB2_2x8;
_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;
_viv_uniform VXC_512Bits uniPackGR2_2x8;
_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;
@ -145,137 +132,3 @@ __kernel void pre_process_yuv420_copy_U8toU8(
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}
// store bgrbgrbgr
__kernel void pre_process_yuv420_copy_trans_U8(
__read_only image2d_t y_img,
__read_only image2d_t u_img,
__read_only image2d_t v_img,
__write_only image2d_array_t output,
global int * xRatio,
global int * yRatio,
global int * xOffset,
global int * yOffset,
float rMean,
float gMean,
float bMean,
float var,
int reverse_channel,
int trans
)
{
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);
int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0);
vxc_uchar16 Y;
vxc_uchar8 U, V;
vxc_int4 C0, C1, C2, C3;
vxc_uchar16 R, G, B;
vxc_uchar16 dst;
VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
var *= outputScale;
float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\
rMean * var - zp, var);
half4 paramData_f16;
_viv_asm(CONV, paramData_f16, paramData);
//C = Y - 16;
//D = U - 128;
//E = V - 128;
// calculate R
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
int tmpV = -56992;
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);
VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);
VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);
// calculate G
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
// 298Y - 208V
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);
// 34784 - 100U
ushort tmpG = 34784;
vxc_ushort8 tmpDstG;
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
VXC_DP4x4(dst, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);
VXC_DP4x4(dst, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);
VXC_DP4x4(dst, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);
VXC_DP4x4(dst, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);
VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);
VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);
// calculate B
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);
tmpV = -70688;
VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);
VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);
// reorder to bgr
vxc_uchar8 tmpdst0, tmpdst1;
vxc_uchar16 dst0, dst1, dst2;
if(bOrder == 2)
{
vxc_uchar16 exchangeData = B;
B = R;
R = exchangeData;
}
// BGR BGR BG
VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);
VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);
// RBG RBG RB
VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);
VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);
pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);
VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
pos.x += 16;
// GRB GRB GR
VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8);
VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8);
// BGR BGR BG
VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);
VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);
VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
pos.x += 16;
// RBG RBG RB
VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);
VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);
// GRB GRB GR
VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);
VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}
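The constants -56992, 34784 and -70688 used above are the standard integer BT.601 YUV-to-RGB formulas with C = Y - 16, D = U - 128, E = V - 128 folded in, as the comments spell out. A plain C version of that conversion follows; the clamp helper and the sample YUV triple are illustrative.
#include <stdint.h>
#include <stdio.h>

static uint8_t clamp_u8(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void yuv_to_rgb(int y, int u, int v, uint8_t *r, uint8_t *g, uint8_t *b)
{
    *r = clamp_u8((298 * y + 409 * v - 56992) >> 8);           /* (298C + 409E + 128) >> 8 */
    *g = clamp_u8((298 * y - 100 * u - 208 * v + 34784) >> 8); /* (298C - 100D - 208E + 128) >> 8 */
    *b = clamp_u8((298 * y + 516 * u - 70688) >> 8);           /* (298C + 516D + 128) >> 8 */
}

int main(void)
{
    uint8_t r, g, b;
    yuv_to_rgb(81, 90, 240, &r, &g, &b);   /* roughly pure red */
    printf("R=%u G=%u B=%u\n", r, g, b);
    return 0;
}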

View File

@ -1,235 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;
_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;
_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;
_viv_uniform int bOrder;
_viv_uniform int rOrder;
_viv_uniform int zp;
_viv_uniform float outputScale;
__kernel void pre_process_yuv420_trans_U8toU8(
__read_only image2d_array_t y_img, __read_only image2d_array_t u_img,
__read_only image2d_array_t v_img, __write_only image2d_array_t output,
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
{
int4 gidx = get_global_id(0);
int gidy = get_global_id(1);
gidx += (int4)(0, 1, 2, 3);
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);
int4 sx = fx & 0xffff8000; // Floor
int fy, sy;
fx -= sx;
sx = sx >> 15;
fx = (fx +(1 << 4)) >> 5;
// for y
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);
sy = fy & 0xffff8000; // Floor
fy -= sy;
sy = sy >> 15;
sy = sy < 0 ? 0 : sy;
fy = fy < 0 ? 0 : fy;
fy = (fy + (1<< 4)) >> 5;
sx += (*xOffset);
sy += (*yOffset);
int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);
int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);
int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);
vxc_uchar16 Y, U, V;
vxc_int4 C0, C1, C2, C3;
vxc_uchar16 R, G, B;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.x + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.x + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
srcPos.x = sx.y;
srcPos1.x = sx.y >> 1;
srcPos2.x = sx.y >> 1;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.y + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.y + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
srcPos.x = sx.z;
srcPos1.x = sx.z >> 1;
srcPos2.x = sx.z >> 1;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.z + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.z + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));
srcPos.x = sx.w;
srcPos1.x = sx.w >> 1;
srcPos2.x = sx.w >> 1;
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));
srcPos1.x = (sx.w + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));
srcPos2.x = (sx.w + 1) >> 1;
VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));
//C = Y - 16; D = U - 128; E = V - 128;
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
int tmpV = -56992;
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
// 298Y - 208V
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);
// 34784 - 100U
ushort tmpG = 34784;
vxc_ushort8 tmpDstG, tmpDstG1;
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);
tmpV = -70688;
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
int4 result, temp1, temp2, dstR, dstG, dstB;
int4 tmpData0, tmpData1;
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
temp1 = fx * tmpData0 + tmpData1;
// temp2 - temp1
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
tmpV = 1 << 19;
vxc_uchar8 dst, tmpPack;
float4 tmpDst;
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
tmpDst = (tmpDst - bMean) * var;
dstB = convert_int4_rte(tmpDst * outputScale + zp);
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
temp1 = fx * tmpData0 + tmpData1;
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
tmpDst = (tmpDst - gMean) * var;
dstG = convert_int4_rte(tmpDst * outputScale + zp);
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);
temp1 = fx * tmpData0 + tmpData1;
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
tmpDst = (tmpDst - rMean) * var;
dstR = convert_int4_rte(tmpDst * outputScale + zp);
if(bOrder == 2)
{
int4 exchangeData = dstB;
dstB = dstR;
dstR = exchangeData;
}
VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);
int2 dstPos = (int2)(get_global_id(0) * 3, gidy);
VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}

View File

@ -22,19 +22,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniPackBG0_2x8;
_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;
_viv_uniform VXC_512Bits uniPackRB0_2x8;
_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;
_viv_uniform VXC_512Bits uniPackGR1_2x8;
_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;
_viv_uniform VXC_512Bits uniPackBG1_2x8;
_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;
_viv_uniform VXC_512Bits uniPackRB2_2x8;
_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;
_viv_uniform VXC_512Bits uniPackGR2_2x8;
_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;
_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;
@ -143,137 +130,3 @@ __kernel void pre_process_yuv444_copy_U8toU8(
pos.z = rOrder;
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}
// store bgrbgrbgr
__kernel void pre_process_yuv444_copy_trans_U8(
__read_only image2d_t y_img,
__read_only image2d_t u_img,
__read_only image2d_t v_img,
__write_only image2d_array_t output,
global int * xRatio,
global int * yRatio,
global int * xOffset,
global int * yOffset,
float rMean,
float gMean,
float bMean,
float var,
int reverse_channel,
int trans
)
{
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);
vxc_uchar16 Y, U, V;
vxc_int4 C0, C1, C2, C3;
vxc_uchar16 R, G, B;
vxc_uchar16 dst;
VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
var *= outputScale;
float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\
rMean * var - zp, var);
half4 paramData_f16;
_viv_asm(CONV, paramData_f16, paramData);
//C = Y - 16;
//D = U - 128;
//E = V - 128;
// calculate R
// ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]
int tmpV = -56992;
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);
VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);
VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);
VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);
// calculate G
// ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]
// 298Y - 208V
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);
// 34784 - 100U
ushort tmpG = 34784;
vxc_ushort8 tmpDstG0, tmpDstG1;
VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8);
VXC_DP4x4(dst, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);
VXC_DP4x4(dst, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);
VXC_DP4x4(dst, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);
VXC_DP4x4(dst, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);
VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);
VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);
// calculate B
// ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);
tmpV = -70688;
VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);
VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);
// reorder to bgr
vxc_uchar8 tmpdst0, tmpdst1;
vxc_uchar16 dst0, dst1, dst2;
if(bOrder == 2)
{
vxc_uchar16 exchangeData = B;
B = R;
R = exchangeData;
}
// BGR BGR BG
VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);
VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);
// RBG RBG RB
VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);
VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);
pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);
VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
pos.x += 16;
// GRB GRB GR
VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8);
VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8);
// BGR BGR BG
VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);
VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);
VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
pos.x += 16;
// RBG RBG RB
VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);
VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);
// GRB GRB GR
VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);
VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);
VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
}

View File

@ -1,196 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniCalculateR1st_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;
_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;
_viv_uniform VXC_512Bits uniCalculateB1st_4x4;
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
_viv_uniform VXC_512Bits uniDescaleU8_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise_4x4;
_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;
_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;
_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;
_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;
_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;
_viv_uniform int bOrder;
_viv_uniform int rOrder;
_viv_uniform int zp;
_viv_uniform float outputScale;
#define IMAGE_PRE_PROCESS_YUV444_TRANS(dst_name, dst_type) \
__kernel void pre_process_yuv444_trans_U8to##dst_name( \
__read_only image2d_t y_img, __read_only image2d_t u_img, \
__read_only image2d_t v_img, __write_only image2d_t output, \
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \
float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \
{ \
int4 gidx = get_global_id(0); \
int gidy = get_global_id(1); \
gidx += (int4)(0, 1, 2, 3); \
\
int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \
int4 sx = fx & 0xffff8000; \
int fy, sy; \
fx -= sx; \
sx = sx >> 15; \
fx = (fx +(1 << 4)) >> 5; \
\
fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \
sy = fy & 0xffff8000; \
fy -= sy; \
sy = sy >> 15; \
\
sy = sy < 0 ? 0 : sy; \
fy = fy < 0 ? 0 : fy; \
\
fy = (fy + (1<< 4)) >> 5; \
sx += (*xOffset); \
sy += (*yOffset); \
int2 srcPos = (int2)(sx.x, sy); \
\
vxc_uchar16 Y, U, V; \
vxc_int4 C0, C1, C2, C3; \
vxc_uchar16 R, G, B; \
\
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
\
srcPos.x = sx.y; \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
\
srcPos.x = sx.z; \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \
\
srcPos.x = sx.w; \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \
\
int tmpV = -56992; \
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \
VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \
\
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \
VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \
VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \
\
ushort tmpG = 34784; \
vxc_ushort8 tmpDstG, tmpDstG1; \
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \
VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \
VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \
VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \
VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \
VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \
\
VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \
VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \
tmpV = -70688; \
VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
\
int4 result, temp1, temp2, dstR, dstG, dstB; \
int4 tmpData0, tmpData1; \
\
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
temp1 = fx * tmpData0 + tmpData1; \
\
VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
\
tmpV = 1 << 19; \
dst_type dst, tmpPack; \
float4 tmpDst; \
\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
tmpDst = (tmpDst - bMean) * var; \
dstB = convert_int4_rte(tmpDst * outputScale + zp); \
\
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
temp1 = fx * tmpData0 + tmpData1; \
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
tmpDst = (tmpDst - gMean) * var; \
dstG = convert_int4_rte(tmpDst * outputScale + zp); \
\
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \
temp1 = fx * tmpData0 + tmpData1; \
VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \
VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
tmpDst = (tmpDst - rMean) * var; \
dstR = convert_int4_rte(tmpDst * outputScale + zp); \
\
if(bOrder == 2) \
{ \
int4 exchangeData = dstB; \
dstB = dstR; \
dstR = exchangeData; \
} \
\
VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); \
VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); \
\
int2 dstPos = (int2)(get_global_id(0) * 3, gidy); \
VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \
}
IMAGE_PRE_PROCESS_YUV444_TRANS(U8, vxc_uchar16)
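
The macro body above ends the YUV444 pre-process path: the gathered Y/U/V neighbours are converted to R/G/B with fixed-point dot products, the four neighbours are blended with the fx/fy weights, and the result is normalized with the per-channel mean/var and requantized with outputScale/zp (R and B swap when bOrder == 2). A minimal scalar sketch of that arithmetic in plain float C, assuming BT.601-style coefficients — the kernel's real coefficients are baked into the uniCalculate* uniforms:

    #include <stdio.h>

    /* Float sketch of one output value of the YUV444 pre-process path.
     * 1.402 is an assumed BT.601-style coefficient; G and B use analogous
     * conversions with their own coefficients in the real kernel. */
    static float yuv_to_r(float y, float v) { return y + 1.402f * (v - 128.f); }

    static float lerp2d(float p00, float p10, float p01, float p11, float fx, float fy)
    {
        float top    = p00 + (p10 - p00) * fx;
        float bottom = p01 + (p11 - p01) * fx;
        return top + (bottom - top) * fy;        /* same blend the macro does per channel */
    }

    int main(void)
    {
        /* four neighbouring source samples (Y, V) and the bilinear weights */
        float Y[4] = {90, 95, 100, 105}, V[4] = {150, 150, 152, 152};
        float fx = 0.25f, fy = 0.5f;
        float rMean = 127.5f, var = 1.f / 127.5f; /* per-channel normalization */
        float outputScale = 64.f, zp = 128.f;     /* requantization parameters */

        float r[4];
        for (int i = 0; i < 4; i++) r[i] = yuv_to_r(Y[i], V[i]);
        float rBlend = lerp2d(r[0], r[1], r[2], r[3], fx, fy);
        float q = (rBlend - rMean) * var * outputScale + zp;
        q = q < 0.f ? 0.f : (q > 255.f ? 255.f : q);
        printf("R out: %d\n", (int)(q + 0.5f));   /* G and B follow the same pattern */
        return 0;
    }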

View File

@ -28,37 +28,34 @@ __kernel void resize_bilinear_BF16toBF16_DOWN
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
int bottom_y_idx = top_y_idx + 1;
vxc_short8 top;
vxc_short8 bottom;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 src;
float4 left4;
@ -84,7 +81,14 @@ __kernel void resize_bilinear_BF16toBF16_DOWN
vxc_ushort8 tmp, dst;
_viv_asm(COPY, tmp, dst4, 16);
dst.s0123 = tmp.s1357;
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
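
In the rewritten DOWN kernels the separately computed right_x_idx / bottom_y_idx (from ceil) disappear: the right and bottom taps are fetched as the sample immediately after the left/top one, via the adjacent lane or the (0, 1) load offset. That is equivalent because the source coordinate is mapped as in = (dst + half_pixel_value) * scale - half_pixel_value, the left tap is floor(in) and the blend weight is the fractional part; when in lands exactly on a sample the weight of the extra tap is zero. A small sketch of the mapping, assuming half_pixel_value is 0.5 for half-pixel centers (and 0 otherwise):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float scale = 2.0f;             /* e.g. downscale by 2 */
        float half_pixel_value = 0.5f;  /* assumption, see above */
        int dst_x = 3;

        float in_x   = ((float)dst_x + half_pixel_value) * scale - half_pixel_value;
        float left_f = floorf(in_x);
        float x_lerp = in_x - left_f;   /* weight of the right neighbour */
        int   left   = (int)left_f;

        printf("src taps %d and %d, weight %.2f\n", left, left + 1, x_lerp);
        return 0;
    }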
__kernel void resize_bilinear_BF16toBF16_UP
@ -107,22 +111,24 @@ __kernel void resize_bilinear_BF16toBF16_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_ushort8 src0, src1, src2, src3, dst0, dst1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
@ -132,29 +138,36 @@ __kernel void resize_bilinear_BF16toBF16_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
vxc_ushort8 dst_tmp;
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, left4, dst_tmp, 16);
@ -176,7 +189,30 @@ __kernel void resize_bilinear_BF16toBF16_UP
vxc_ushort8 tmp, dst;
_viv_asm(COPY, tmp, dst4, 16);
dst.s0123 = tmp.s1357;
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.z ++;
} while (coord_in.z < depth);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 dst_tmp;
VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, left4, dst_tmp, 16);
VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, right4, dst_tmp, 16);
right4 -= left4;
top4 = right4 * x_lerp + left4;
VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, left4, dst_tmp, 16);
VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, right4, dst_tmp, 16);
right4 -= left4;
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
vxc_ushort8 tmp, dst;
_viv_asm(COPY, tmp, dst4, 16);
dst.s0123 = tmp.s1357;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
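
The BF16 kernels above lean on bfloat16 being the upper 16 bits of an IEEE-754 float32: the uniConvBF16toF32_Part0/1_2x8 uniforms widen each stored 16-bit value into the high half of a 32-bit lane, and on the way out `dst.s0123 = tmp.s1357` keeps only the high half of each computed float (truncation, assuming the little-endian lane layout). A host-side sketch of the same two conversions:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* bf16 -> f32: the stored bits become the high half of a float32. */
    static float bf16_to_f32(uint16_t h)
    {
        uint32_t bits = (uint32_t)h << 16;
        float f;
        memcpy(&f, &bits, sizeof f);
        return f;
    }

    /* f32 -> bf16 by truncation, like selecting tmp.s1357 in the kernel. */
    static uint16_t f32_to_bf16(float f)
    {
        uint32_t bits;
        memcpy(&bits, &f, sizeof bits);
        return (uint16_t)(bits >> 16);
    }

    int main(void)
    {
        float x = 1.625f;
        uint16_t b = f32_to_bf16(x);
        printf("0x%04x -> %g\n", b, bf16_to_f32(b));  /* 1.625 survives exactly */
        return 0;
    }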

View File

@ -1,7 +1,7 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform VXC_512Bits uniFp16toFp32_4x4;
_viv_uniform VXC_512Bits uniFp16toFp32_left_4x4;
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
_viv_uniform float2 scale_xy;
@ -27,94 +27,66 @@ __kernel void resize_bilinear_F16toF16_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_short8 top_left0, top_right0;
vxc_short8 bottom_left0, bottom_right0;
vxc_half8 top_left, top_right;
vxc_half8 bottom_left, bottom_right;
vxc_short8 top_short, bottom_short, dst;
vxc_half8 top, bottom, result;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top_left, top_left0, 16);
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top_right, top_right0, 16);
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, bottom_left, bottom_left0, 16);
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, bottom_right, bottom_right0, 16);
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, top_short, 16);
_viv_asm(COPY, bottom, bottom_short, 16);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
half4 tmp;
_viv_asm(CONV, tmp, dst4);
VXC_DP2x8(top_left, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, top_left0, top_left, 16);
VXC_WriteImage2DArray(output, coord_out, top_left0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_DP2x8(result, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, result, 16);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_bilinear_F16toU8_DOWN
@ -131,84 +103,50 @@ __kernel void resize_bilinear_F16toU8_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_short8 top_left0, top_right0;
vxc_short8 bottom_left0, bottom_right0;
vxc_half8 top_left, top_right;
vxc_half8 bottom_left, bottom_right;
vxc_short8 top_short, bottom_short;
vxc_half8 top, bottom;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top_left, top_left0, 16);
VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, top_short, 16);
_viv_asm(COPY, bottom, bottom_short, 16);
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top_right, top_right0, 16);
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, bottom_left, bottom_left0, 16);
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, bottom_right, bottom_right0, 16);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
@ -216,7 +154,14 @@ __kernel void resize_bilinear_F16toU8_DOWN
int4 dst = convert_int4_rte(dst4);
vxc_uchar8 dst_uchar;
VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, dst_uchar, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_uchar,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_bilinear_F16toF16_UP
@ -239,24 +184,26 @@ __kernel void resize_bilinear_F16toF16_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_ushort8 src0, src1, src2, src3, dst0, dst1;
vxc_half8 top;
vxc_half8 bottom;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
@ -266,32 +213,41 @@ __kernel void resize_bilinear_F16toF16_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
@ -299,7 +255,28 @@ __kernel void resize_bilinear_F16toF16_UP
_viv_asm(CONV, tmp, dst4);
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst0, top, 16);
VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.z ++;
} while (coord_in.z < depth);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
half4 tmp;
_viv_asm(CONV, tmp, dst4);
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst0, top, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
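
Every *_UP kernel in this commit replaces its `do { ... } while (coord_in.z < depth)` with `int loop = depth - 1; while (coord_in.z < loop)` plus one peeled iteration after the loop. The loop body now prefetches the next slice (`coord_in.w += input_desc.s4`) while computing the current one, and peeling the last iteration keeps that prefetch from reading past the final slice. The pattern, sketched with plain arrays standing in for the image loads and stores:

    #include <stdio.h>

    /* Loop-peeling sketch: compute slice z while prefetching slice z + 1,
     * then handle the last slice after the loop so no prefetch runs past the end. */
    static void process_all_slices(const float *in, float *out, int depth)
    {
        int z = 0;
        float cur = in[0];                 /* prologue: first slice already loaded */
        for (; z < depth - 1; ++z)
        {
            float next = in[z + 1];        /* prefetch next slice while cur is live */
            out[z] = cur * 2.0f;           /* stand-in for the interpolation math */
            cur = next;
        }
        out[z] = cur * 2.0f;               /* peeled tail: compute only, no prefetch */
    }

    int main(void)
    {
        float in[4] = {1, 2, 3, 4}, out[4];
        process_all_slices(in, out, 4);
        for (int i = 0; i < 4; ++i) printf("%g\n", out[i]);
        return 0;
    }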

View File

@ -5,8 +5,8 @@ _viv_uniform float2 scale_xy;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform float dfpScale;
_viv_uniform float half_pixel_value;
@ -34,8 +34,6 @@ __kernel void resize_bilinear_I16toI16_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_ushort8 src0, src1, src2, src3, dst0, dst1;
@ -44,16 +42,19 @@ __kernel void resize_bilinear_I16toI16_UP
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
@ -63,39 +64,42 @@ __kernel void resize_bilinear_I16toI16_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src2, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
@ -103,10 +107,30 @@ __kernel void resize_bilinear_I16toI16_UP
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.z ++;
} while (coord_in.z < depth);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
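
The I16 (and I8) variants are dynamic-fixed-point kernels: uniDFPtoFp32_left_4x4 / uniRightSubLeft_4x4 turn the stored integers into floats for the blend, and the single dfpScale multiplier requantizes the result before convert_int4_rte. A sketch under the usual DFP convention — stored = real * 2^fl with fl the fraction length, and dfpScale collapsing the input and output fraction lengths into one factor (exactly how dfpScale is composed, and whether part of it is folded into the uniforms, is an assumption):

    #include <math.h>
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int fl_in = 10, fl_out = 8;
        int16_t a = 1536, b = 2048;          /* Q10: 1.5 and 2.0 */
        float x_lerp = 0.25f;

        /* blend on the raw integers, then rescale once: 2^(fl_out - fl_in) */
        float dfp_scale = ldexpf(1.0f, fl_out - fl_in);
        float blended   = a + (b - a) * x_lerp;              /* still in Q10 */
        int16_t dst     = (int16_t)lrintf(blended * dfp_scale); /* matches convert_int4_rte */

        printf("real %.4f -> stored %d (Q%d)\n",
               blended * ldexpf(1.0f, -fl_in), dst, fl_out);
        return 0;
    }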
__kernel void resize_bilinear_I16toI16_DOWN
@ -125,103 +149,67 @@ __kernel void resize_bilinear_I16toI16_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_short8 top_left, top_right;
vxc_short8 bottom_left, bottom_right;
vxc_short8 top, bottom, result;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top_left, top_left, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, top_right, top_right, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
right4 -= left4;
VXC_DP4x4(left4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, bottom_left, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, bottom_right, bottom_right, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
right4 -= left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
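
The common refactor across all of these files replaces VXC_ReadImage2DArray / VXC_WriteImage2DArray with explicit 3-D accesses: the image is copied into an int8 descriptor, baseAddr = coord.z * desc.s4 + desc.s0 is computed once, carried in coord.w, and advanced by desc.s4 per depth step. Reading the diff, s0 behaves like the base address and s4 like the slice pitch; a plain host-side sketch of that addressing (the field meanings are inferred from usage here, not from a published descriptor layout):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Base + slice-pitch addressing, mirroring what the rewritten kernels do
     * with the int8 image descriptor (s0 ~ base address, s4 ~ slice pitch). */
    typedef struct {
        uint8_t *base;          /* descriptor.s0 */
        size_t   slice_pitch;   /* descriptor.s4: bytes between z slices */
        size_t   row_pitch;     /* bytes between y rows */
    } image3d_desc;

    static uint8_t *pixel_addr(const image3d_desc *d, int x, int y, int z, size_t elem)
    {
        /* baseAddr = z * s4 + s0, then ordinary 2-D addressing inside the slice;
         * the kernels advance this base with coord.w += desc.s4 per depth step. */
        return d->base + (size_t)z * d->slice_pitch
                       + (size_t)y * d->row_pitch
                       + (size_t)x * elem;
    }

    int main(void)
    {
        enum { W = 4, H = 2, D = 3 };
        uint8_t *buf = calloc(W * H * D, 1);
        image3d_desc desc = { buf, W * H, W };
        *pixel_addr(&desc, 3, 1, 2, 1) = 42;
        printf("%u\n", buf[2 * W * H + 1 * W + 3]);   /* 42 */
        free(buf);
        return 0;
    }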

View File

@ -5,8 +5,8 @@ _viv_uniform float2 scale_xy;
_viv_uniform int depth;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform float dfpScale;
_viv_uniform float half_pixel_value;
@ -34,8 +34,6 @@ __kernel void resize_bilinear_I8toI8_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_uchar16 src0, src1, dst0, dst1;
@ -44,12 +42,15 @@ __kernel void resize_bilinear_I8toI8_UP
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
@ -59,37 +60,42 @@ __kernel void resize_bilinear_I8toI8_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
@ -97,10 +103,31 @@ __kernel void resize_bilinear_I8toI8_UP
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.z ++;
} while (coord_in.z < depth);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, top, dst0, 16);
_viv_asm(COPY, bottom, dst1, 16);
VXC_DP4x4(left4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * dfpScale;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_bilinear_I8toI8_DOWN
@ -112,98 +139,55 @@ __kernel void resize_bilinear_I8toI8_DOWN
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);
float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_char16 top_left, top_right;
vxc_char16 bottom_left, bottom_right;
vxc_char16 top, bottom, result;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, top_right, top_right, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
right4 -= left4;
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, bottom_left, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
VXC_DP4x4(right4, bottom_right, bottom_right, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);
right4 -= left4;
VXC_DP4x4(left4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
@ -213,6 +197,11 @@ __kernel void resize_bilinear_I8toI8_DOWN
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}

View File

@ -1,13 +1,13 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;
_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform float2 scale_xy;
_viv_uniform int depth;
_viv_uniform int input_ZP;
_viv_uniform float uint8Scale;
_viv_uniform float output_ZP;
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;
_viv_uniform VXC_512Bits uniGetMaskShift_2x8;
_viv_uniform float half_pixel_value;
@ -26,69 +26,36 @@ __kernel void resize_bilinear_U8toF16_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_uchar16 top_left, top_right;
vxc_uchar16 bottom_left, bottom_right;
vxc_uchar16 top, bottom;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
@@ -97,16 +64,12 @@ __kernel void resize_bilinear_U8toF16_DOWN
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
right4 -= left4;
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
right4 -= left4;
VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
@@ -120,7 +83,12 @@ __kernel void resize_bilinear_U8toF16_DOWN
vxc_short8 dst_short;
_viv_asm(COPY, dst_short, dst, 16);
VXC_WriteImage2DArray(output, coord_out, dst_short.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short.s0246,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
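The repeated descriptor sequence in these kernels (COPY the image into an int8, read .s0 and .s4, stash the result in coord.w) simply precomputes a per-slice base address for the raw img_load_3d/img_store_3d operations. A rough host-side picture, assuming .s0 holds the tensor base address and .s4 the z-slice stride in bytes, an assumption inferred from how the kernels use the descriptor:
/* Illustrative only: mirrors "baseAddr = (int)coord.z * desc.s4 + desc.s0". */
typedef struct { int s0, s1, s2, s3, s4, s5, s6, s7; } image_desc_t;

static int slice_base_addr(const image_desc_t *desc, int z)
{
    return z * desc->s4 + desc->s0;   /* assumed: s0 = base address, s4 = z stride */
}
/* Stepping to the next slice is then a single add, which is what the depth
 * loop below does with "coord_in.w += input_desc.s4". */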
__kernel void resize_bilinear_U8toU8_UP
@@ -147,8 +115,6 @@ __kernel void resize_bilinear_U8toU8_UP
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_uchar16 src0, src1;
@@ -157,12 +123,15 @@ __kernel void resize_bilinear_U8toU8_UP
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 bitextract_p0;
vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
@@ -172,46 +141,67 @@ __kernel void resize_bilinear_U8toU8_UP
VXC_DP2x8(maskShift, bitextract_p0, constData, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);
do
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.w += input_desc.s4;
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.z ++;
coord_in.y = top_y_idx;
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
float4 top4;
float4 bottom4;
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, inputZP, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * uint8Scale + output_ZP;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
}
coord_out.z ++;
} while (coord_in.z < depth);
VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom, inputZP, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, bottom, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * uint8Scale + output_ZP;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
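The switch from the old do/while over coord.z to a while (coord_in.z < depth - 1) loop plus a trailing copy of the body looks like plain software pipelining: the loads for slice z+1 are issued inside the loop while slice z is still being interpolated, and the final slice is finished in an epilogue that issues no further load. A schematic C version under that reading, with load_slice()/process_slice() standing in for the img_load_3d and BitExtract/DP4x4/store sequences:
static void load_slice(int z)    { (void)z; /* img_load_3d of src0/src1 for slice z      */ }
static void process_slice(int z) { (void)z; /* BitExtract + lerp + requantize + store z  */ }

static void resize_depth_loop(int depth)
{
    load_slice(0);                      /* prologue: first slice loaded up front   */
    for (int z = 0; z < depth - 1; ++z)
    {
        load_slice(z + 1);              /* prefetch the next slice                 */
        process_slice(z);               /* consume the one already in registers    */
    }
    process_slice(depth - 1);           /* epilogue: last slice, nothing to fetch  */
}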
__kernel void resize_bilinear_U8toU8_DOWN
@@ -228,69 +218,36 @@ __kernel void resize_bilinear_U8toU8_DOWN
float4 left_x_f = floor(in_x);
float4 x_lerp = in_x - left_x_f;
int4 left_x_idx = convert_int4(left_x_f);
float4 right_x_f = ceil(in_x);
int4 right_x_idx = convert_int4(right_x_f);
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;
float top_y_f = floor(in_y);
float y_lerp = in_y - top_y_f;
int top_y_idx = convert_int(top_y_f);
float bottom_y_f = ceil(in_y);
int bottom_y_idx= convert_int(bottom_y_f);
vxc_uchar16 top_left, top_right;
vxc_uchar16 bottom_left, bottom_right;
vxc_uchar16 top, bottom, result;
int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(top_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(top_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.y = bottom_y_idx;
coord_in.x = left_x_idx.x;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.y;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.z;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = left_x_idx.w;
VXC_ReadImage2DArray(bottom_left, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.x;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.y;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.z;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = right_x_idx.w;
VXC_ReadImage2DArray(bottom_right, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));
float4 left4;
float4 right4;
@@ -299,25 +256,26 @@ __kernel void resize_bilinear_U8toU8_DOWN
unsigned char inputZP;
_viv_asm(COPY, inputZP, input_ZP, 4);
VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
right4 -= left4;
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);
right4 -= left4;
VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);
VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
bottom4 = right4 * x_lerp + left4;
bottom4 -= top4;
float4 dst4 = bottom4 * y_lerp + top4;
dst4 = dst4 * uint8Scale + output_ZP;
int4 dst = convert_int4_rte(dst4);
VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -69,7 +69,8 @@ __kernel void resize_bilinear_U8toU8_UP_opt
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
do
int loop = depth - 1;
while (coord_in.z < loop)
{
VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
@@ -88,8 +89,17 @@ __kernel void resize_bilinear_U8toU8_UP_opt
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_out.w += output_desc.s4;
coord_out.z ++;
} while (coord_out.z < depth);
coord_in.z ++;
}
VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst;
VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom,
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
#endif

View File

@@ -28,18 +28,30 @@ __kernel void resize_nearest_F16toF16
vxc_short8 src;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniGetExtractData_2x8;
@@ -56,18 +68,29 @@ __kernel void resize_nearest_F16toF16_op
vxc_ushort8 src0, src1, dst;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
//in_x_idx = in_x_idx - in_x_idx.xxxx;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
VXC_DP2x8(mask, input_idx, input_idx, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);
VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;
@@ -84,19 +107,31 @@ __kernel void resize_nearest_I8toI8
vxc_char16 src;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
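Each of the four single-lane loads above lands in a different lane of src through the start/end arguments of VXC_MODIFIER, so a quad of output pixels is assembled as a small gather from precomputed x indices on one input row. The scalar equivalent, with src_row and in_x_idx as illustrative names:
/* Gather four nearest-neighbour samples from one input row. */
static void nearest_gather4(const signed char *src_row,
                            const int in_x_idx[4], signed char dst[4])
{
    for (int i = 0; i < 4; ++i)
        dst[i] = src_row[in_x_idx[i]];   /* lane i <- pixel at its own x index */
}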
__kernel void resize_nearest_I8toI8_op
@@ -113,8 +148,14 @@ __kernel void resize_nearest_I8toI8_op
vxc_char16 dst;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
@@ -123,7 +164,13 @@ __kernel void resize_nearest_I8toI8_op
VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, dst, dst0, 8);
VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_nearest_U8toU8
@@ -139,22 +186,34 @@ __kernel void resize_nearest_U8toU8
vxc_uchar16 src;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
VXC_DP2x8(src, src, multiplier, \
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_nearest_U8toU8_op
@@ -170,8 +229,14 @@ __kernel void resize_nearest_U8toU8_op
vxc_uchar16 src0, dst;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);
vxc_ushort8 input_idx;
_viv_asm(COPY, input_idx, in_x_idx, 16);
@@ -180,7 +245,13 @@ __kernel void resize_nearest_U8toU8_op
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_nearest_I16toI16
@@ -196,19 +267,32 @@ __kernel void resize_nearest_I16toI16
vxc_short8 src;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.y;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.z;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = in_x_idx.w;
VXC_ReadImage2DArray(src, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_nearest_I16toI16_op
@@ -224,10 +308,16 @@ __kernel void resize_nearest_I16toI16_op
vxc_ushort8 src0, src1, dst0;
vxc_short8 dst;
int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);
VXC_ReadImage2DArray(src0, input, coord_in, \
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, \
VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
//in_x_idx = in_x_idx - in_x_idx.xxxx;
vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);
@@ -237,5 +327,11 @@ __kernel void resize_nearest_I16toI16_op
VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, dst, dst0, 8);
VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -0,0 +1,135 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniExtractEvenUint8Stride2_2x8;
_viv_uniform VXC_512Bits uniExtractOddUint8Stride2_2x8;
_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4;
_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4;
_viv_uniform int input_depth;
#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT(src0_type_name, src1_type_name, read_type) \
__kernel void space2depth_internal_##src0_type_name##to##src1_type_name( \
image2d_array_t input, \
image2d_array_t output, \
int block_size_x, \
int block_size_y \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
int4 coord = (int4)(gidx, gidy, gidz, 0); \
read_type src; \
VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
\
ushort stride_x = (ushort)block_size_x; \
ushort stride_y = (ushort)block_size_y; \
ushort sidx = (ushort)gidx; \
ushort sidy = (ushort)gidy; \
ushort tmpX = sidx % stride_x; \
ushort tmpY = sidy % stride_y; \
int tmpId0 = tmpX; \
int tmpId1 = tmpY; \
int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0); \
coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz; \
VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
}
SPACE2DEPTH_INTERNAL_QINT_TO_QINT(U8, U8, vxc_uchar16)
SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I8, I8, vxc_char16)
SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I16, I16, vxc_short8)
__kernel void space2depth_internal_F16toF16(
image2d_array_t input,
image2d_array_t output,
int block_size_x,
int block_size_y
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord = (int4)(gidx, gidy, gidz, 0);
vxc_short8 data, imgVal0;
VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
ushort stride_x = (ushort)block_size_x;
ushort stride_y = (ushort)block_size_y;
ushort sidx = (ushort)gidx;
ushort sidy = (ushort)gidy;
ushort tmpX = sidx % stride_x;
ushort tmpY = sidy % stride_y;
int tmpId0 = tmpX;
int tmpId1 = tmpY;
int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0);
coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz;
VXC_WriteImage2DArray(output, coord_out, data, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
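The coordinate arithmetic shared by all space2depth_internal kernels above reduces to the mapping below: the block-local (x, y) phase selects a depth slab of size input_depth, while the spatial position is divided by the block size. A scalar sketch, where block_x, block_y and in_depth are the host-side equivalents of block_size_x, block_size_y and input_depth:
typedef struct { int x, y, z; } coord3_t;

/* Where input element (x, y, z) ends up in the space-to-depth output. */
static coord3_t space2depth_map(int x, int y, int z,
                                int block_x, int block_y, int in_depth)
{
    coord3_t out;
    out.x = x / block_x;
    out.y = y / block_y;
    /* same as: coord_out.z = tmpX*input_depth + tmpY*block_size_x*input_depth + gidz */
    out.z = (x % block_x) * in_depth + (y % block_y) * block_x * in_depth + z;
    return out;
}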
#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \
__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \
image2d_array_t input, \
image2d_array_t output, \
int block_size_x, \
int block_size_y \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
\
int4 coord = (int4)(gidx, gidy, gidz, 0); \
int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \
int out_d1; \
read_type imageData; \
write_type imgVal0, imgVal1; \
\
VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
out_d1 = gidz + input_depth; \
\
VXC_DP2x8(imgVal0, imageData, imageData,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractEvenUint8Stride2_2x8); \
VXC_DP2x8(imgVal1, imageData, imageData,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddUint8Stride2_2x8); \
VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord_out.z = out_d1; \
VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(U8, U8, vxc_uchar16, vxc_uchar16)
SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(I8, I8, vxc_char16, vxc_char16)
#define SPACE2DEPTH_INTERNAL_16BITS_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \
__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \
image2d_array_t input, \
image2d_array_t output, \
int block_size_x, \
int block_size_y \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
\
int4 coord = (int4)(gidx, gidy, gidz, 0); \
int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \
int out_d1; \
read_type imageData; \
write_type imgVal0, imgVal1; \
\
VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
out_d1 = gidz + input_depth; \
VXC_DP4x4(imgVal0, imageData, imageData, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractEvenFp16Stride2_4x4); \
VXC_DP4x4(imgVal1, imageData, imageData, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractOddFp16Stride2_4x4); \
VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
coord_out.z = out_d1; \
VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
SPACE2DEPTH_INTERNAL_16BITS_X2Y1(I16, I16, vxc_short8, vxc_short8)
SPACE2DEPTH_INTERNAL_16BITS_X2Y1(F16, F16, vxc_short8, vxc_short8)

View File

@@ -0,0 +1,58 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvertDatatoF32_4x4;
_viv_uniform float output_scale;
_viv_uniform float tail;
#define UPSAMPLE_SCALETO_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type, conv_func) \
__kernel void upsamplescale_##src_name##to##dst_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride, \
float scale) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
read_type read_val; \
src_type src_val; \
dst_type dst_val; \
write_type write_val; \
VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, src_val, read_val, 16); \
coord.xy *= stride; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord.w, baseAddr); \
float4 data; \
VXC_DP4x4(data, src_val, src_val, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertDatatoF32_4x4); \
data = data * output_scale + tail; \
_viv_asm(conv_func, dst_val, data); \
_viv_asm(COPY, write_val, dst_val, 16); \
int4 coord_out = coord; \
for (int y = 0; y < stride; y++) \
{ \
coord_out.x = coord.x; \
for (int x = 0; x < stride; ) \
{ \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, write_val, \
VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); \
x++; \
coord_out.x ++; \
} \
coord_out.y ++; \
} \
}
UPSAMPLE_SCALETO_FUN(F16, F16, vxc_short8, vxc_half8, half4, short4, CONV)
UPSAMPLE_SCALETO_FUN(F16, I16, vxc_short8, vxc_half8, int4, short4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(F16, I8, vxc_short8, vxc_half8, int4, char4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(F16, U8, vxc_short8, vxc_half8, int4, uchar4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(I16, I16, vxc_short8, vxc_short8, int4, short4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(I16, F16, vxc_short8, vxc_short8, half4, short4, CONV)
UPSAMPLE_SCALETO_FUN(I8, I8, vxc_char16, vxc_char16, int4, char4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(I8, F16, vxc_short8, vxc_short8, half4, short4, CONV)
UPSAMPLE_SCALETO_FUN(U8, U8, vxc_uchar16, vxc_uchar16, int4, uchar4, CONV_RTE)
UPSAMPLE_SCALETO_FUN(U8, F16, vxc_short8, vxc_short8, half4, short4, CONV)
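Functionally, the generic upsamplescale kernel above is a nearest-neighbour upsample by an integer stride with the (de)quantisation folded into output_scale and tail: each input value is rescaled once and then written into a stride x stride block of the output. A host-side reference for a single element, with out_w, x and y as illustrative names:
/* Replicate one rescaled input value into its stride x stride output block. */
static void upsamplescale_ref(float in_val, float *out, int out_w,
                              int x, int y, int stride,
                              float output_scale, float tail)
{
    float v = in_val * output_scale + tail;       /* data * output_scale + tail */
    for (int dy = 0; dy < stride; ++dy)
        for (int dx = 0; dx < stride; ++dx)
            out[(y * stride + dy) * out_w + (x * stride + dx)] = v;
}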

View File

@@ -0,0 +1,83 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniUpScale2X_lo_2x8;
_viv_uniform VXC_512Bits uniUpScale2X_hi_2x8;
_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
#define UPSAMPLE_SCALETO8B_FUN(src_name, dst_name, read_type, src_type, dst_type) \
__kernel void upsamplescale_##src_name##to##dst_name##_K2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride, \
float scale) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
read_type read_val; \
src_type src_val; \
dst_type dst_val; \
VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, src_val, read_val, 16); \
coord.xy <<= 1; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord.w, baseAddr); \
vxc_ushort8 multiplier; \
_viv_asm(COPY, multiplier, multAndoutZP, 16); \
VXC_DP2x8(dst_val, src_val, multiplier, \
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_lo_2x8); \
VXC_DP2x8(dst_val, src_val, multiplier, \
VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_hi_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
coord.y ++; \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
}
UPSAMPLE_SCALETO8B_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char16)
UPSAMPLE_SCALETO8B_FUN(F16, U8, vxc_short8, vxc_half8, vxc_uchar16)
UPSAMPLE_SCALETO8B_FUN(I8, I8, vxc_char16, vxc_char16, vxc_char16)
UPSAMPLE_SCALETO8B_FUN(U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16)
#define UPSAMPLE_SCALETO16B_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type) \
__kernel void upsamplescale_##src_name##to##dst_name##_K2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int stride, \
float scale) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
read_type read_val; \
src_type src_val; \
dst_type dst0_val; \
dst_type dst1_val; \
write_type write_val; \
VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, src_val, read_val, 16); \
coord.xy <<= 1; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord.w, baseAddr); \
vxc_ushort8 multiplier; \
_viv_asm(COPY, multiplier, multAndoutZP, 16); \
VXC_DP2x8(dst0_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_lo_2x8); \
VXC_DP2x8(dst1_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_hi_2x8); \
_viv_asm(COPY, write_val, dst0_val, 16); \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
coord.y ++; \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, write_val, dst1_val, 16); \
coord.xy = coord.xy + (int2)(8, -1); \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
coord.y ++; \
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
UPSAMPLE_SCALETO16B_FUN(F16, F16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(I8, F16, vxc_char16, vxc_char16, vxc_half8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(U8, F16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)
UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
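The _K2 variants above specialise stride == 2: the uniUpScale2X_lo/hi dot products appear to duplicate each source element into adjacent output lanes while applying the multiplier and zero point packed into multAndoutZP, and each produced row is stored twice (rows y and y+1). A scalar sketch of that duplication under those assumptions, with requantisation written out inline:
/* Stride-2 fast path: each input element fills a 2x2 output block. */
static void upscale2x_row_ref(const unsigned char *in_row, unsigned char *out,
                              int out_w, int y, int n,
                              float mult, float out_zp)
{
    for (int i = 0; i < n; ++i)
    {
        float v = in_row[i] * mult + out_zp;              /* requantise once   */
        unsigned char q = (unsigned char)(v + 0.5f);
        out[y * out_w + 2 * i]           = q;             /* even lane         */
        out[y * out_w + 2 * i + 1]       = q;             /* odd lane          */
        out[(y + 1) * out_w + 2 * i]     = q;             /* row written twice */
        out[(y + 1) * out_w + 2 * i + 1] = q;
    }
}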

View File

@@ -1,111 +0,0 @@
#include "cl_viv_vx_ext.h"
//-----------------------------------------------tensor crop-------------------------------
__kernel void vxcTensorCrop_Int16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int offset0,
int offset1,
int offset2)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_ushort8 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\
- offset1, get_global_id(2) - offset2, 0);
VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void vxcTensorCrop_Int8(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int offset0,
int offset1,
int offset2)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_uchar16 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1) - offset1,\
get_global_id(2) - offset2, 0);
VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;
__kernel void vxcTensorCrop_Int16_Fp16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int offset0,
int offset1,
int offset2)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\
- offset1, get_global_id(2) - offset2, 0);
vxc_half8 dst0, dst1, dst2, dst3;
VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt16toFp16_2x8);
VXC_DP2x8(dst1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt16toFp16_2x8);
VXC_DP2x8(dst2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt16toFp16_2x8);
VXC_DP2x8(dst3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt16toFp16_2x8);
vxc_short8 out0, out1, out2, out3;
_viv_asm(COPY, out0, dst0, 16);
_viv_asm(COPY, out1, dst1, 16);
_viv_asm(COPY, out2, dst2, 16);
_viv_asm(COPY, out3, dst3, 16);
VXC_WriteImage2DArray(output, coord_out, out0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, out1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, out2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.y ++;
VXC_WriteImage2DArray(output, coord_out, out3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,63 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int loopNum;
_viv_uniform VXC_512Bits uniMulAcc_16x1;
__kernel void vsi_nn_kernel_fullconnect2(
__read_only image2d_array_t input,
__read_only image2d_array_t weight,
__read_only image2d_array_t bias,
__write_only image2d_array_t output)
{
int4 coord_in = (int4)(16, get_global_id(0), get_global_id(1), 0);
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
vxc_short8 v0, v1, v2, v3, v4, v5, v6, v7;
vxc_half8 i0, i1, i2, i3;
vxc_half8 w0, w1, w2, w3;
float4 sum = 0;
float dst = 0;
dst = read_imagef(bias, coord_in.ywww).x;
do
{
VXC_ReadImage(v0, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, i0, v0, 16);
VXC_ReadImage(v1, weight, coord_in.xy, VXC_5BITOFFSET_XY(-16, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w0, v1, 16);
VXC_ReadImage(v2, input, coord_in.xz, VXC_5BITOFFSET_XY(-8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, i1, v2, 16);
VXC_ReadImage(v3, weight, coord_in.xy, VXC_5BITOFFSET_XY(-8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w1, v3, 16);
VXC_ReadImage(v4, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, i2, v4, 16);
VXC_ReadImage(v5, weight, coord_in.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w2, v5, 16);
VXC_ReadImage(v6, input, coord_in.xz, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, i3, v6, 16);
VXC_ReadImage(v7, weight, coord_in.xy, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w3, v7, 16);
coord_in.x += 32;
VXC_DP16x1(sum, i0, w0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);
VXC_DP16x1(sum, i1, w1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);
VXC_DP16x1(sum, i2, w2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);
VXC_DP16x1(sum, i3, w3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);
float4 tmp = {1, 1, 1, 1};
dst = dst + dot(sum, tmp);
} while (coord_in.x < loopNum);
vxc_half v;
_viv_asm(CONV, v, dst);
_viv_asm(COPY, v0, v, 16);
VXC_WriteImage(output, coord_out.xy, v0, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,129 +0,0 @@
#include "cl_viv_vx_ext.h"
/*****************************layernorm uint8 to fp16****************************/
_viv_uniform int width;
_viv_uniform float dimRatio;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniSumU8_16x1;
_viv_uniform VXC_512Bits uniSqrSum_16x1;
_viv_uniform float input_scale;
_viv_uniform int inputZP;
_viv_uniform int sumInZp;
_viv_uniform int tmpZp1;
_viv_uniform int tmpZp2;
_viv_uniform float e2InScale;
_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;
_viv_uniform VXC_512Bits UniPackFP16even_2x8;
__kernel void vxcLayerNormU8toFp16(
image2d_array_t input,
image2d_array_t bias,
image2d_array_t scale,
image2d_array_t output,
float eps)
{
int4 coord = (int4)(0, get_global_id(1), 0, 0);
vxc_uchar16 src0;
float sum = 0, sqr = 0;
int tmpSum = 0, tmpSqr = 0;
vxc_int4 tmpSum1;
vxc_int4 tmpSqr1;
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);
VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);
tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);
}
sum = (tmpSum + sumInZp) * input_scale;
sqr = (tmpSqr + tmpZp2) * e2InScale;
float mean, vari;
mean = sum * dimRatio;
vari = sqr*dimRatio - mean*mean;
vari += eps;
vari = rsqrt(vari);
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;
int4 coord_bias = (int4)(0, 0, 0, 0);
vxc_half8 scale_h;
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
vxc_short8 src1, outval;
short zp = inputZP;
half4 tmpVal0, tmpVal1;
vxc_half8 dst;
for(coord.x = 0; coord.x < width; coord.x += 16)
{
coord_bias.x = coord.x;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
bias_f1 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, scale_h, src1, 16);
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert1stUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert2ndUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert3rdUint8SubZpToFp32_4x4);
VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvert4thUint8SubZpToFp32_4x4);
tmpData0 *= input_scale;
tmpData1 *= input_scale;
tmpData2 *= input_scale;
tmpData3 *= input_scale;
vxc_float4 norm;
tmpData0 -= mean;
norm = scale_f0 * vari * tmpData0 + bias_f0;
bias_f0 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
coord_bias.x += 4;
_viv_asm(CONV, tmpVal0, norm);
tmpData1 -= mean;
norm = scale_f1 * vari * tmpData1 + bias_f1;
bias_f1 = read_imagef(bias, coord_bias);
VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniConvertSecFp16Fp32_4x4);
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
int2 coord_out = (int2)(coord.x, coord.y);
VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
tmpData2 -= mean;
norm = scale_f0 * vari * tmpData2 + bias_f0;
_viv_asm(CONV, tmpVal0, norm);
tmpData3 -= mean;
norm = scale_f1 * vari * tmpData3 + bias_f1;
_viv_asm(CONV, tmpVal1, norm);
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x += 8;
VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@@ -1,38 +0,0 @@
#include "cl_viv_vx_ext.h"
//--------------------------resize-------------------------
_viv_uniform VXC_512Bits uniPackEvenData_2x8;
__kernel void resize_16bits_downsample_quarter
(
__read_only image2d_array_t input,
__write_only image2d_array_t output
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_short8 src0, src1;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(8, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord = coord >> 1;
VXC_DP2x8(src0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardInf, 0), uniPackEvenData_2x8);
VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void resize_8bits_downsample_quarter
(
__read_only image2d_array_t input,
__write_only image2d_array_t output
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_char16 src0;
vxc_char8 dst;
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord = coord >> 1;
dst = src0.s02468ace;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,49 +0,0 @@
#include "cl_viv_vx_ext.h"
//--------------------------scale-------------------------
_viv_uniform VXC_512Bits uniExtractHalf8_2x8;
_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Lo_4x4;
_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Hi_4x4;
__kernel void scale_fp16
(
__read_only image2d_array_t input,
__read_only image2d_array_t weights,
__read_only image2d_array_t biases,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
vxc_short8 vec0, vec1;
vxc_half8 src0;
vxc_half8 w0;
vxc_float4 b0, b1;
vxc_float4 dst0, dst1;
VXC_ReadImage(vec0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, src0, vec0, 16);
VXC_ReadImage(vec1, weights, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, w0, vec1, 16);
coord.z = coord.x + 4;
b0 = read_imagef(biases, coord.xwww);
b1 = read_imagef(biases, coord.zwww);
VXC_DP4x4(dst0, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniFp16MulFp16ToFp32_Lo_4x4);
VXC_DP4x4(dst1, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
uniFp16MulFp16ToFp32_Hi_4x4);
dst0 += b0;
dst1 += b1;
half4 t0, t1;
_viv_asm(CONV, t0, dst0);
_viv_asm(CONV, t1, dst1);
VXC_DP2x8(w0, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);
_viv_asm(COPY, vec0, w0, 16);
VXC_WriteImage(output, coord.xy, vec0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

View File

@@ -1,67 +0,0 @@
#include "cl_viv_vx_ext.h"
/******************shuffle channel float16/int16********************/
_viv_uniform int group_column;
_viv_uniform float rgroup_column;
__kernel void shuffleChannelVXC(
image2d_array_t input,
image2d_array_t output,
int group_number,
int axis)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int coordz = coord.z;
int index_col = coordz * rgroup_column;
int index_row = coordz - index_col * group_column;
coord.z = index_row * group_number + index_col;
VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
/*****************shuffle channel int8/uint8****************************/
__kernel void shuffleChannel8BitsVXC(
image2d_array_t input,
image2d_array_t output,
int group_number,
int axis)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_char16 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int coordz = coord.z;
int index_col = coordz * rgroup_column;
int index_row = coordz - index_col * group_column;
coord.z = index_row * group_number + index_col;
VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord.y ++;
VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
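Both shuffle-channel shaders in this file (and the Axis1 variants in the next one) use the same channel index remapping, driven by the group_column uniform and its reciprocal rgroup_column. A host-side C sketch of the mapping, with hypothetical names and assuming group_column = channels / group_number:
/* Map an input channel index to its shuffled output channel index. */
static int shuffle_channel_index(int c, int group_number, int group_column)
{
    int index_col = c / group_column;              /* which group the channel came from   */
    int index_row = c - index_col * group_column;  /* position of the channel in that group */
    return index_row * group_number + index_col;   /* interleave channels across groups    */
}
This is the usual channel-shuffle transpose: the channel axis is viewed as a [group_number, group_column] matrix and read back as [group_column, group_number].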

View File

@ -1,65 +0,0 @@
#include "cl_viv_vx_ext.h"
/******************shuffle channel float16/int16********************/
_viv_uniform int group_column;
_viv_uniform float rgroup_column;
__kernel void shuffleChannel16Bits_Axis1(
image2d_array_t input,
image2d_array_t output,
int group_number,
int axis)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_short8 src0, src1, src2, src3;
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int coordy = coord.y;
int index_col = coordy * rgroup_column;
int index_row = coordy - index_col * group_column;
coord_out.y = index_row * group_number + index_col;
VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.x += 8;
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.x += 8;
VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
coord_out.x += 8;
VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
/*****************shuffle channel int8/uint8****************************/
__kernel void shuffleChannel8Bits_Axis1(
image2d_array_t input,
image2d_array_t output,
int group_number,
int axis)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
vxc_char16 src0, src1;
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.x += 16;
VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int coordy = coord.y;
int index_col = coordy * rgroup_column;
int index_row = coordy - index_col * group_column;
coord_out.y = index_row * group_number + index_col;
VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
coord_out.x += 16;
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}

View File

@ -1,41 +0,0 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4;
_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4;
_viv_uniform int input_depth;
__kernel void vxcReorg2_fp16_fp16_sx2_sy1
(
image2d_array_t input,
image2d_array_t output,
int stridex,
int stridey
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord = (int4)(gidx, gidy, gidz, 0);
int4 coord_out = (int4)(gidx >> 1, gidy, 0, 0);
int out_d0, out_d1;
vxc_short8 imageData;
vxc_short8 imgVal0, imgVal1;
//int tmpw = gidz / input_depth; \n\
//int tmpz = gidz % input_depth; \n\
VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(imgVal0, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
uniExtractEvenFp16Stride2_4x4);
VXC_DP4x4(imgVal1, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
uniExtractOddFp16Stride2_4x4);
out_d0 = gidz * 2 * 1;
out_d1 = out_d0 + 1;
coord_out.z = out_d0;
VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_out.z = out_d1;
VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
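The deleted reorg shader covers the stride_x = 2, stride_y = 1 case by splitting even and odd columns into two output channels. A plain C reference loop for the general stride_x case, assuming dense [z][y][x] layout (names illustrative, not the GPU implementation):
/* Fold width into the channel dimension: out[ox, y, z*stride_x + k] = in[ox*stride_x + k, y, z]. */
static void reorg_sx_ref(const float *in, float *out,
                         int width, int height, int depth, int stride_x)
{
    int out_width = width / stride_x;
    int z, y, ox, k;
    for (z = 0; z < depth; ++z)
        for (y = 0; y < height; ++y)
            for (ox = 0; ox < out_width; ++ox)
                for (k = 0; k < stride_x; ++k)
                {
                    int oz = z * stride_x + k;   /* interleaved output channel */
                    out[(oz * height + y) * out_width + ox] =
                        in[(z * height + y) * width + ox * stride_x + k];
                }
}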

File diff suppressed because it is too large

View File

@ -10,8 +10,11 @@ CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")
################################################################################
# Supply necessary libraries.
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC
ifeq ($(USE_VXC_BINARY)$(USE_VSC_LITE),11)
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC_Lite -lGAL
else
LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC -lGAL
endif
LIBS += -lm -ldl
#############################################################################

View File

@ -219,7 +219,10 @@ static vsi_bool op_check
IO_TYPE(D_F32, D_I32)
IO_TYPE(D_F16, D_I32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_U8|Q_ASYM, D_I32)
IO_TYPE(D_I8|Q_DFP, D_I32)
IO_TYPE(D_I8, D_I32)
IO_TYPE(D_U8|Q_ASYM, D_I32)
IO_TYPE(D_U8, D_I32)
END_IO_TYPE_DECL(ARGMIN)
if(!VALIDATE_OP_IO_TYPES(ARGMIN, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,

View File

@ -44,190 +44,6 @@
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
#define USE_OVX_API TRUE
#if (USE_OVX_API == FALSE)
extern vx_kernel_description_t * vx_kernel_CROP_list[];
static void _set_inputs_outputs
(
vx_reference * params,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
uint32_t i;
uint32_t cnt;
/* Set inputs */
cnt = 0;
for( i = 0; i < _INPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)inputs[i]->t;
}
/* Set outputs */
for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)outputs[i]->t;
}
} /* _set_inputs_outputs() */
static vsi_status _create_params
(
vsi_nn_node_t * node,
vx_reference * params,
uint32_t num
)
{
vsi_status status;
vx_context ctx;
vsi_nn_crop_param * p;
if( 0 == num )
{
return VSI_SUCCESS;
}
memset( params, 0, sizeof( vx_reference * ) * num );
p = &(node->nn_param.crop);
ctx = vxGetContext( (vx_reference)node->graph->g );
/* Init parameters */
#define _SET_PARAM( i, type, arg ) do{ \
params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \
status = vxGetStatus( params[i] ); \
if( VSI_SUCCESS != status ) { \
goto set_param_error; \
} \
} while(0)
_SET_PARAM( 0, VX_TYPE_INT32, offset[0] );
_SET_PARAM( 1, VX_TYPE_INT32, offset[1] );
_SET_PARAM( 2, VX_TYPE_INT32, offset[2] );
#undef _SET_PARAM
set_param_error:
return status;
} /* _create_params */
static void _release_params
(
vx_reference * params,
uint32_t num
)
{
uint32_t i;
vx_scalar scalar;
for( i = 0; i < num; i ++ )
{
scalar = (vx_scalar)params[i];
vxReleaseScalar( &scalar );
}
} /* _release_params() */
static vsi_status cpu_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_reference * args;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
return status;
}
static vsi_status vx_op_pre_init
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_kernel_info_t * kernel_info
)
{
vsi_nn_type_e dataFormat = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e dstFormat = outputs[0]->attr.dtype.vx_type;
if (dataFormat == VSI_NN_TYPE_FLOAT16
|| (dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_INT16))
{
kernel_info->kernel_index = 1;
}
else if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->kernel_index = 3;
}
else
{
kernel_info->kernel_index = 2;
}
return VSI_SUCCESS;
}
static vsi_status vx_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_border_t border;
vx_reference * args;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
border.mode = VX_BORDER_REPLICATE;
border.constant_value.U32 = 0;
status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border));
return status;
}
static vsi_nn_op_compute_t op_compute_list[] =
{
cpu_op_compute,
vx_op_compute,
NULL
};
#endif
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -236,7 +52,6 @@ static vsi_status op_compute
)
{
vsi_status status = VSI_FAILURE;
#if (USE_OVX_API == TRUE)
vx_nn_stride_slice_params_t param;
vsi_nn_tensor_t *begin_dims_tensor = NULL;
vsi_nn_tensor_t *end_dims_tensor = NULL;
@ -317,36 +132,6 @@ static vsi_status op_compute
{
status = VSI_SUCCESS;
}
#else
vsi_nn_kernel_info_t kernel_info;
memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t));
status = VSI_FAILURE;
kernel_info.resource_num = 1;
kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *));
kernel_info.resource_name[0] = "vsi_nn_kernel_crop";
kernel_info.type = vsi_nn_GetVXKernelTypeForShader();
kernel_info.kernel = vx_kernel_CROP_list;
kernel_info.init_index = 1;
if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type))
{
vx_op_pre_init(self, inputs, outputs, &kernel_info);
}
self->n = vsi_nn_RegisterClientKernelAndNewNode(
self->graph, &kernel_info);
if (kernel_info.resource_name) free(kernel_info.resource_name);
if( NULL == self->n )
{
return VSI_FAILURE;
}
if (NULL != op_compute_list[kernel_info.init_index])
{
status = op_compute_list[kernel_info.init_index](self, inputs, outputs);
}
#endif
OnError:
if (begin_dims_tensor) vsi_nn_ReleaseTensor(&begin_dims_tensor);
if (end_dims_tensor) vsi_nn_ReleaseTensor(&end_dims_tensor);
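With the shader path removed, CROP is expressed only through the OVX strided-slice node built from the begin/end tensors above. A sketch of the assumed mapping from crop offsets and output sizes to those parameters (the exact computation sits in the part of op_compute not shown in this hunk; the helper name is hypothetical):
#include <stdint.h>
/* begin = offset, end = offset + output_size, stride = 1 on every axis. */
static void crop_to_strided_slice(const int32_t *offset, const uint32_t *out_size,
                                  uint32_t dim_num, int32_t *begin,
                                  int32_t *end, int32_t *stride)
{
    uint32_t i;
    for (i = 0; i < dim_num; ++i)
    {
        begin[i]  = offset[i];
        end[i]    = offset[i] + (int32_t)out_size[i];
        stride[i] = 1;
    }
}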

View File

@ -221,6 +221,9 @@ static vsi_bool op_check
IO_TYPE(D_BF16, D_F32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_I16|Q_DFP)
IO_TYPE(D_I16, D_I16|Q_DFP)
IO_TYPE(D_I8, D_I8|Q_DFP)
IO_TYPE(D_U8, D_U8|Q_ASYM)
END_IO_TYPE_DECL(DATACONVERT)
if (!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num))
{

View File

@ -196,6 +196,7 @@ static vsi_bool op_check
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP)

View File

@ -89,7 +89,11 @@ static vsi_bool op_check
IO_TYPE(D_I32, D_F16, D_F16)
IO_TYPE(D_I32, D_F32, D_F32)
IO_TYPE(D_I32, D_I32, D_I32)
IO_TYPE(D_I32, D_U8|Q_ASYM, D_F32)
IO_TYPE(D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I32, D_U8|Q_ASYM, D_I8|Q_DFP)
IO_TYPE(D_I32, D_U8|Q_ASYM, D_I8)
IO_TYPE(D_F16, D_F16, D_F16)
END_IO_TYPE_DECL(EMBEDDING_LOOKUP)
if (!VALIDATE_OP_IO_TYPES(EMBEDDING_LOOKUP, self, inputs, self->input.num, outputs, self->output.num))

View File

@ -42,215 +42,6 @@
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
#define USE_OVX_API TRUE
#if (USE_OVX_API == FALSE)
extern vx_kernel_description_t * vx_kernel_FCL2_list[];
static void _set_inputs_outputs
(
vx_reference * params,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
uint32_t i;
uint32_t cnt;
/* Set inputs */
cnt = 0;
for( i = 0; i < _INPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)inputs[i]->t;
}
/* Set outputs */
for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)outputs[i]->t;
}
} /* _set_inputs_outputs() */
static vsi_status _create_params
(
vsi_nn_node_t * node,
vx_reference * params,
uint32_t num
)
{
vsi_status status;
vx_context ctx;
vsi_nn_fcl_param * p;
if( 0 == num )
{
return VSI_SUCCESS;
}
memset( params, 0, sizeof( vx_reference * ) * num );
p = &(node->nn_param.fcl);
ctx = vxGetContext( (vx_reference)node->graph->g );
/* Init parameters */
#define _SET_PARAM( i, type, arg ) do{ \
params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \
status = vxGetStatus( params[i] ); \
if( VSI_SUCCESS != status ) { \
goto set_param_error; \
} \
} while(0)
_SET_PARAM( 0, VX_TYPE_INT32, axis );
//_SET_PARAM( 1, VX_TYPE_FLOAT32, bias );
//_SET_PARAM( 2, VX_TYPE_TENSOR, data_bias );
//_SET_PARAM( 3, VX_TYPE_TENSOR, data_weight );
//_SET_PARAM( 4, VX_TYPE_FLOAT32, regularize );
_SET_PARAM( 1, VX_TYPE_INT32, weights );
#undef _SET_PARAM
set_param_error:
return status;
} /* _create_params */
static void _release_params
(
vx_reference * params,
uint32_t num
)
{
uint32_t i;
vx_scalar scalar;
for( i = 0; i < num; i ++ )
{
scalar = (vx_scalar)params[i];
vxReleaseScalar( &scalar );
}
} /* _release_params() */
static vsi_status cpu_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_reference * args;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
return status;
}
static vsi_status vx_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
uint32_t axis;
vsi_nn_fcl_param * p;
uint32_t i = 0;
uint32_t num_fc = 1, num_no_fc = 1;
uint32_t num_of_dims[3] = {0};
uint32_t input_size[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t output_size[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t weights_size[VSI_NN_MAX_DIM_NUM] = {0};
int32_t size[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t ofm = 0;
uint32_t dims = 0;
vx_tensor input = NULL;
vx_tensor output = NULL;
vx_tensor weight = NULL;
vx_tensor bias = NULL;
int32_t index = 0;
vx_border_t border;
if( NULL == self->n )
{
return VSI_FAILURE;
}
p = (vsi_nn_fcl_param *)&(self->nn_param.fcl);
axis = p->axis;
memcpy(input_size, inputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
num_of_dims[0] = inputs[0]->attr.dim_num;
memcpy(output_size, outputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
num_of_dims[1] = outputs[0]->attr.dim_num;
memcpy(weights_size, inputs[1]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
num_of_dims[2] = inputs[1]->attr.dim_num;
ofm = weights_size[num_of_dims[2] - 1];
for(i = 0; i <= (uint32_t)axis; ++i)
{
num_fc *= input_size[i];
}
for(i = axis + 1; i < num_of_dims[0]; ++i)
{
num_no_fc *= input_size[i];
}
size[0] = num_fc;
size[1] = num_no_fc;
dims= 2;
input = vxReshapeTensor(inputs[0]->t, size, dims);
size[0] = num_fc;
size[1] = ofm;
dims= 2;
weight = vxReshapeTensor(inputs[1]->t, size, dims);
size[0] = ofm;
size[1] = 1;
dims= 2;
bias = vxReshapeTensor(inputs[2]->t, size, dims);
size[0] = ofm;
size[1] = num_no_fc;
dims= 2;
output = vxReshapeTensor(outputs[0]->t, size, dims);
status |= vxSetParameterByIndex(self->n, index++, (vx_reference)input);
status |= vxSetParameterByIndex(self->n, index++, (vx_reference)weight);
status |= vxSetParameterByIndex(self->n, index++, (vx_reference)bias);
status |= vxSetParameterByIndex(self->n, index++, (vx_reference)output);
border.mode = VX_BORDER_CONSTANT;
border.constant_value.S16 = 0;
status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border));
if (input) vxReleaseTensor(&input);
if (weight) vxReleaseTensor(&weight);
if (bias) vxReleaseTensor(&bias);
if (output) vxReleaseTensor(&output);
return status;
}
static vsi_nn_op_compute_t op_compute_list[] =
{
cpu_op_compute,
vx_op_compute,
NULL
};
#endif
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -259,7 +50,6 @@ static vsi_status op_compute
)
{
vsi_status status = VSI_FAILURE;
#if (USE_OVX_API == TRUE)
uint32_t axis;
vsi_nn_fcl_param * p;
uint32_t i = 0;
@ -343,30 +133,7 @@ static vsi_status op_compute
if (weight) vxReleaseTensor(&weight);
if (bias) vxReleaseTensor(&bias);
if (output) vxReleaseTensor(&output);
#else
vsi_nn_kernel_info_t kernel_info;
memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t));
status = VSI_FAILURE;
kernel_info.resource_num = 1;
kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *));
kernel_info.resource_name[0] = "vsi_nn_kernel_fullconnect2";
kernel_info.type = VX_KERNEL_TYPE_VX;
kernel_info.kernel = vx_kernel_FCL2_list;
kernel_info.kernel_index = 1;
kernel_info.init_index = 1;
self->n = vsi_nn_RegisterClientKernelAndNewNode(
self->graph, &kernel_info);
if (kernel_info.resource_name) free(kernel_info.resource_name);
if( NULL == self->n )
{
return VSI_FAILURE;
}
if (NULL != op_compute_list[kernel_info.init_index])
{
status = op_compute_list[kernel_info.init_index](self, inputs, outputs);
}
#endif
return status;
} /* op_compute() */
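Both the removed shader path and the surviving OVX path fold the tensors to 2D around the FC axis: every dimension up to and including axis becomes the dot-product length, the remaining dimensions become the batch. A small helper sketch of that folding (illustrative only, not a library API):
#include <stdint.h>
/* Resulting 2D views, with ofm = last weight dimension:
 *   input  -> [num_fc,  num_no_fc]
 *   weight -> [num_fc,  ofm]
 *   bias   -> [ofm,     1]
 *   output -> [ofm,     num_no_fc]                                   */
static void fcl2_fold_shapes(const uint32_t *input_size, uint32_t input_dims,
                             uint32_t axis, uint32_t *num_fc, uint32_t *num_no_fc)
{
    uint32_t i;
    *num_fc = 1;
    *num_no_fc = 1;
    for (i = 0; i <= axis; ++i)             /* folded into the dot-product length */
        *num_fc *= input_size[i];
    for (i = axis + 1; i < input_dims; ++i) /* folded into the batch dimension    */
        *num_no_fc *= input_size[i];
}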

View File

@ -74,6 +74,7 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "block_size", block_size );
vsi_nn_kernel_param_add_int32( param, "block_num", block_num );
vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num );
vsi_nn_kernel_param_add_int32( param, "axis", axis );
vsi_nn_kernel_param_add_int32( param, "indices_num", indices_num );
n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param );
if( n != NULL )

View File

@ -41,6 +41,50 @@
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
static vsi_status _try_set_high_presision_tensor
(
vsi_nn_tensor_t **inputs
)
{
vsi_status status;
vsi_nn_vxtensor_attr_t attr;
status = VSI_SUCCESS;
attr = VSI_NN_TENSOR_ATTR_HIGH_PRECISION;
if(VSI_NN_TYPE_FLOAT32 == inputs[1]->attr.dtype.vx_type)
{
status = vsi_nn_SetTensorAttr(inputs[1], attr);
if(VSI_SUCCESS != status)
{
return status;
}
}
if(VSI_NN_TYPE_FLOAT32 == inputs[2]->attr.dtype.vx_type)
{
status = vsi_nn_SetTensorAttr(inputs[2], attr);
if(VSI_SUCCESS != status)
{
return status;
}
}
return status;
}
static vsi_bool _is_3d_instance_norm
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs
)
{
if( 3 == inputs[0]->attr.dim_num )
{
return TRUE;
}
return FALSE;
} /* _is_3d_instance_norm() */
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -55,19 +99,42 @@ static vsi_status op_compute
uint32_t *input_size = inputs[0]->attr.size;
uint32_t dims_num = inputs[0]->attr.dim_num;
int32_t rs_flg = 0;
vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL};
vsi_nn_tensor_t * tmp_outputs[1] = {NULL};
vsi_nn_instancenorm_lcl_data2 *local = self->nn_param.instancenorm.lcl2_data;
param =vsi_nn_kernel_param_create();
if((input_size[1] * input_size[2] < 65536)
&& dims_num > 2)
status = _try_set_high_presision_tensor(inputs);
if(status != VSI_SUCCESS)
{
rs_flg = 1;
VSILOGE("Set tensor attr of high presision fail");
return status;
}
if(_is_3d_instance_norm(self, inputs))
{
tmp_inputs[0] = local->reshaped_input;
tmp_outputs[0] = local->reshaped_output;
tmp_inputs[1] = inputs[1];
tmp_inputs[2] = inputs[2];
}
else
{
tmp_inputs[0] = inputs[0];
tmp_outputs[0] = outputs[0];
tmp_inputs[1] = inputs[1];
tmp_inputs[2] = inputs[2];
if((input_size[1] * input_size[2] < 65536)
&& dims_num > 2)
{
rs_flg = 1;
}
}
param =vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_float32( param, "eps", eps );
vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg );
n = vsi_nn_kernel_selector( self->graph, "instance_norm",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
tmp_inputs, _INPUT_NUM, tmp_outputs, _OUTPUT_NUM, param );
if( n != NULL )
{
self->n = (vx_node)n;
@ -82,6 +149,59 @@ static vsi_status op_compute
return status;
} /* op_compute() */
static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
uint32_t dim = 0;
vsi_nn_instancenorm_lcl_data2 *local = NULL;
uint32_t shape[VSI_NN_MAX_DIM_NUM];
char tensor_name[128];
dim = inputs[0]->attr.dim_num;
if(_is_3d_instance_norm(self, inputs) == FALSE)
{
return VSI_SUCCESS;
}
VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
/*
insert a reshape node before and after 3D instance_norm
*/
shape[0] = 1;
shape[1] = inputs[0]->attr.size[0];
shape[2] = inputs[0]->attr.size[1];
shape[3] = inputs[0]->attr.size[2];
dim = 4;
local = self->nn_param.instancenorm.lcl2_data;
if (VSI_NN_OPTIMIZE_FORWARD == direction)
{
/* reshape 3d input (xcn) --> 4d input (whcn) */
local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim);
}
else
{
/* reshape 3d output(xcn) --> 4d output(whcn) */
local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim);
if(local->reshaped_output && local->reshaped_output->t)
{
memset(tensor_name, 0, sizeof(tensor_name));
snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid);
if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid);
return VSI_FAILURE;
}
}
}
return VSI_SUCCESS;
} /* op_optimize() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
@ -133,6 +253,8 @@ static vsi_status op_init
self->nn_param.instancenorm.lcl2_data->reshapeFlg = 0;
self->nn_param.instancenorm.lcl2_data->execute_on_sw = 0;
self->nn_param.instancenorm.lcl2_data->hash_idx = 0;
self->nn_param.instancenorm.lcl2_data->reshaped_input = NULL;
self->nn_param.instancenorm.lcl2_data->reshaped_output = NULL;
return status;
} /* op_init() */
@ -143,6 +265,7 @@ static vsi_status op_deinit
)
{
uint32_t i;
vsi_nn_instancenormalize_param *p = &(self->nn_param.instancenorm);
for (i = 0; i < _VSI_NN_INSTANCENORM_LOCAL_TENSOR_NUM; i++)
{
if (self->nn_param.instancenorm.local.local_tensor[i] != NULL)
@ -151,6 +274,16 @@ static vsi_status op_deinit
self->nn_param.instancenorm.local.local_tensor[i] = NULL;
}
}
if(p->lcl2_data->reshaped_input)
{
vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_input));
p->lcl2_data->reshaped_input = NULL;
}
if(p->lcl2_data->reshaped_output)
{
vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_output));
p->lcl2_data->reshaped_output = NULL;
}
if(self->nn_param.instancenorm.lcl2_data)
{
free(self->nn_param.instancenorm.lcl2_data);
@ -173,7 +306,7 @@ DEF_OP_REG
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ vsi_nn_op_common_setup,
/* optimize */ NULL,
/* optimize */ op_optimize,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
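For reference, the math the instance_norm kernels implement per channel and per batch item, using the eps passed through the kernel parameters; gamma and beta stand for the scale and bias inputs. A scalar sketch over one H*W plane, not the GPU implementation:
#include <math.h>
#include <stddef.h>
/* Normalize one plane of n elements with its own mean and variance. */
static void instance_norm_plane_ref(const float *x, float *y, size_t n,
                                    float gamma, float beta, float eps)
{
    float mean = 0.0f, var = 0.0f;
    size_t i;
    for (i = 0; i < n; ++i) mean += x[i];
    mean /= (float)n;
    for (i = 0; i < n; ++i) var += (x[i] - mean) * (x[i] - mean);
    var /= (float)n;
    for (i = 0; i < n; ++i)
        y[i] = gamma * (x[i] - mean) / sqrtf(var + eps) + beta;
}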

View File

@ -115,6 +115,45 @@ final:
}
static vsi_bool _check_value_is_equal_to_one
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* tensor
)
{
vsi_bool ret = TRUE;
float* tensor_data = NULL;
uint32_t elements = 0;
uint32_t i = 0;
elements = vsi_nn_GetElementNum( tensor );
tensor_data = vsi_nn_ConvertTensorToFloat32Data( graph, tensor );
if ( NULL == tensor_data )
{
VSILOGE( "Convert data fail." );
return FALSE;
}
for (i = 0; i < elements; i++)
{
if ( vsi_abs(tensor_data[i] - 1.0f) > 1e-5 )
{
ret = FALSE;
break;
}
}
if ( !tensor->attr.is_created_from_handle )
{
if ( tensor_data )
{
free(tensor_data);
}
}
return ret;
}
static vsi_status op_compute
(
vsi_nn_node_t * self,
@ -141,6 +180,11 @@ static vsi_status op_compute
p = &(self->nn_param.l2normalizescale);
axis = p->axis;
if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1]) )
{
return vsi_nn_internal_compute_node( self );
}
param =vsi_nn_kernel_param_create();
ret = vsi_nn_kernel_optimize_reduce_shape(
@ -240,6 +284,9 @@ static vsi_status op_deinit
self->nn_param.l2normalizescale.local.local_tensor[i] = NULL;
}
}
vsi_nn_internal_deinit_node_wksp( self );
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
@ -253,11 +300,15 @@ static vsi_bool op_setup
)
{
vsi_bool ret = TRUE;
vsi_nn_internal_node_t* curr = NULL;
if( NULL == self )
{
return FALSE;
}
vsi_nn_internal_init_node_wksp( self );
if (self->nn_param.l2normalizescale.axis < 0)
{
self->nn_param.l2normalizescale.axis += (int32_t)inputs[0]->attr.dim_num;
@ -269,6 +320,15 @@ static vsi_bool op_setup
return FALSE;
}
if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one( self->graph, inputs[1] ) )
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0);
curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node( self, curr );
}
ret = vsi_nn_op_common_setup(self, inputs, outputs);
return ret;
@ -280,7 +340,7 @@ static vsi_status op_init
)
{
vsi_status status = VSI_SUCCESS;
uint32_t i;
uint32_t i = 0;
if (vsi_nn_compareVersion(self->graph, 1, 1, 13) == -1)
{
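The bypass added above relies on l2normalizescale computing y = scale * x / ||x||_2 along the chosen axis, so a constant scale of 1.0 (to within the 1e-5 tolerance checked earlier) degenerates to plain L2_NORMALIZE. A one-dimensional scalar sketch; the small epsilon guarding a zero vector is an assumption, not taken from this diff:
#include <math.h>
#include <stddef.h>
/* y[i] = scale[i] * x[i] / sqrt(sum_j x[j]^2) over one row of n elements. */
static void l2_normalize_scale_ref(const float *x, const float *scale,
                                   float *y, size_t n)
{
    float sum_sq = 0.0f, inv_norm;
    size_t i;
    for (i = 0; i < n; ++i) sum_sq += x[i] * x[i];
    inv_norm = 1.0f / sqrtf(sum_sq + 1e-12f); /* guard against an all-zero row */
    for (i = 0; i < n; ++i) y[i] = scale[i] * x[i] * inv_norm;
}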

View File

@ -35,312 +35,11 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "client/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
#define _ARG_NUM (1)
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
extern vx_kernel_description_t * vx_kernel_LAYERNORM_list[];
static void check_tensor_shape
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input,
vx_reference * params,
uint32_t index,
vx_bool rsFlg
)
{
vsi_nn_tensor_attr_t attr;
if (index == 0 )
{
if(input->attr.dim_num == 1)
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.size[1] = 1;
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1)
||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1))
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else
params[index] = (vx_reference)input->t;
}
else if(index == 1 )
{
if(input->attr.dim_num == 1)
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.size[1] = 1;
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else
params[index] = (vx_reference)input->t;
}
else if(index == 2)
{
if(input->attr.dim_num == 1)
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.size[1] = 1;
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else
params[index] = (vx_reference)input->t;
}
else if(index == 3)
{
if(input->attr.dim_num == 1)
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.size[1] = 1;
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1)
||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1))
{
memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t));
attr.dim_num = 2;
self->nn_param.layernorm.local.local_tensor[index] =
vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num);
params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index];
}
else
params[index] = (vx_reference)input->t;
}
else
{
VSILOGE("No more local tensor!(LAYERNORM) at [%s : %d]\n", __FILE__, __LINE__);
}
}
static void _set_inputs_outputs
(
vx_reference * params,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
uint32_t i;
uint32_t cnt;
/* Set inputs */
cnt = 0;
for( i = 0; i < _INPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)inputs[i]->t;
}
/* Set outputs */
for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)outputs[i]->t;
}
} /* _set_inputs_outputs() */
static vsi_status _create_params
(
vsi_nn_node_t * node,
vx_reference * params,
uint32_t num
)
{
vsi_status status;
vx_context ctx;
vsi_nn_layernormalize_param * p;
if( 0 == num )
{
return VSI_SUCCESS;
}
memset( params, 0, sizeof( vx_reference * ) * num );
p = &(node->nn_param.layernorm);
ctx = vxGetContext( (vx_reference)node->graph->g );
/* Init parameters */
#define _SET_PARAM( i, type, arg ) do{ \
params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \
status = vxGetStatus( params[i] ); \
if( VSI_SUCCESS != status ) { \
goto set_param_error; \
} \
} while(0)
_SET_PARAM( 0, VX_TYPE_FLOAT32, eps );
#undef _SET_PARAM
set_param_error:
return status;
} /* _create_params */
static void _release_params
(
vx_reference * params,
uint32_t num
)
{
uint32_t i;
vx_scalar scalar;
for( i = 0; i < num; i ++ )
{
scalar = (vx_scalar)params[i];
vxReleaseScalar( &scalar );
}
} /* _release_params() */
static vsi_status cpu_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_reference * args;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
return status;
}
static vsi_status vx_op_pre_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_kernel_info_t * kernel_info
)
{
vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type;
vsi_nn_type_e scaleDataFormat = inputs[2]->attr.dtype.vx_type;
if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->kernel_index = 1;
}
else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->kernel_index = 2;
}
else if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_UINT8
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->kernel_index = 3;
}
else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16
&& scaleDataFormat == VSI_NN_TYPE_FLOAT16)
{
kernel_info->resource_name[0] = "vsi_nn_kernel_layernormalize_U8";
kernel_info->kernel_index = 4;
}
else
{
VSILOGE("Not support input or output data format!(LAYERNORM) at [%s : %d]\n", __FILE__, __LINE__);
return VSI_FAILURE;
}
return VSI_SUCCESS;
}
static vsi_status vx_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_border_t border;
vx_reference * args;
vx_bool rsFlg = FALSE;
int32_t in_zp;
vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type;
vsi_nn_tensor_attr_t attr;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
//_set_inputs_outputs( params, inputs, outputs );
check_tensor_shape(self, inputs[0], params, 0, rsFlg);
check_tensor_shape(self, inputs[1], params, 1, rsFlg);
check_tensor_shape(self, inputs[2], params, 2, rsFlg);
check_tensor_shape(self, outputs[0], params, 3, rsFlg);
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr(inputs[0]->t, &attr);
in_zp = attr.dtype.zero_point;
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U32 = 0;
border.constant_value.S16 = 0;
border.constant_value.U8 = 0;
if(inputDataFormat == VSI_NN_TYPE_UINT8)
{
border.constant_value.U32 = (vx_uint32)in_zp;
border.constant_value.S16 = (vx_int16)in_zp;
border.constant_value.U8 = (vx_uint8)in_zp;
}
status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border));
return status;
}
static vsi_nn_op_compute_t op_compute_list[] =
{
cpu_op_compute,
vx_op_compute,
NULL
};
static vsi_status op_compute
(
@ -349,35 +48,44 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
vsi_status status;
vsi_nn_kernel_info_t kernel_info;
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
float eps = self->nn_param.layernorm.eps;
uint32_t *input_size = inputs[0]->attr.size;
uint32_t dims_num = inputs[0]->attr.dim_num;
int32_t rs_flg = 0;
int32_t wh_flg = 0;
memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t));
status = VSI_FAILURE;
kernel_info.resource_num = 1;
kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *));
kernel_info.resource_name[0] = "vsi_nn_kernel_layernormalize";
kernel_info.type = vsi_nn_GetVXKernelTypeForShader();
kernel_info.kernel = vx_kernel_LAYERNORM_list;
kernel_info.init_index = 1;
param =vsi_nn_kernel_param_create();
if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type))
if (input_size[0] >= GPU_TENSOR_MAX_WIDTH)
{
vx_op_pre_compute(self, inputs, outputs, &kernel_info);
wh_flg = 1;
}
self->n = vsi_nn_RegisterClientKernelAndNewNode(
self->graph, &kernel_info);
if (kernel_info.resource_name) free(kernel_info.resource_name);
if( NULL == self->n )
if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH)
&& dims_num > 2)
{
return VSI_FAILURE;
rs_flg = 1;
}
if (NULL != op_compute_list[kernel_info.init_index])
vsi_nn_kernel_param_add_float32( param, "eps", eps );
vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg );
vsi_nn_kernel_param_add_int32( param, "wh_flg", wh_flg );
n = vsi_nn_kernel_selector( self->graph, "layer_norm",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
if ( n != NULL )
{
status = op_compute_list[kernel_info.init_index](self, inputs, outputs);
self->n = (vx_node)n;
status = VSI_SUCCESS;
}
if (param != NULL)
{
vsi_nn_kernel_param_release( &param );
}
return status;
} /* op_compute() */
@ -389,10 +97,12 @@ static vsi_bool op_check
)
{
BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1)
IO_TYPE(D_F32, D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16, D_F16)
IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP)
END_IO_TYPE_DECL(LAYER_NORM)
if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num))
{
@ -438,8 +148,8 @@ DEF_OP_REG
/* check */ op_check,
/* setup */ vsi_nn_op_common_setup,
/* optimize */ NULL,
/* input_num */ 3,
/* output_num */ 1
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
#ifdef __cplusplus
}

Some files were not shown because too many files have changed in this diff