From c14141623833f9af115cb2fdfb3f61cf45703182 Mon Sep 17 00:00:00 2001 From: Kainan Cha Date: Mon, 29 Mar 2021 16:21:46 +0800 Subject: [PATCH] Update internal to REL/v1.1.30.2 SHA: 2e64046f Signed-off-by: Kainan Cha --- src/tim/vx/internal/BUILD | 9 - src/tim/vx/internal/include/interface/ops.def | 1 + .../include/internal/internal_ops.def | 1 + .../kernel/vsi_nn_kernel_gpu_shape_optimize.h | 20 + .../internal/include/libnnext/vx_lib_nnext.h | 4 - .../include/ops/vsi_nn_op_instancenormalize.h | 4 + .../ops/vsi_nn_op_space2depth_internal.h | 44 + .../include/ops/vsi_nn_op_upsamplescale.h | 39 + src/tim/vx/internal/include/vsi_nn_graph.h | 5 + .../vx/internal/include/vsi_nn_node_type.h | 4 + .../include/vsi_nn_pre_post_process.h | 6 + .../src/kernel/cl/instance_normalization_cl.c | 78 +- .../src/kernel/cl/layer_normalization_cl.c | 395 + .../vx/internal/src/kernel/cl/matrixmul_cl.c | 124 +- .../vx/internal/src/kernel/cl/roi_align_cl.c | 329 + .../src/kernel/cl/space2depth_internal_cl.c | 298 + .../kernel/cpu/instance_normalization_cpu.c | 2 +- .../src/kernel/cpu/layer_normalization_cpu.c | 255 + .../internal/src/kernel/cpu/roi_align_cpu.c | 378 + .../src/kernel/cpu/space2depth_internal_cpu.c | 230 + .../src/kernel/cpu/upsamplescale_cpu.c | 264 + .../src/kernel/evis/a_times_b_plus_c_evis.c | 56 +- .../vx/internal/src/kernel/evis/gather_evis.c | 299 +- .../kernel/evis/instance_normalization_evis.c | 118 +- .../kernel/evis/layer_normalization_evis.c | 1389 ++++ .../src/kernel/evis/pre_process_bgra_evis.c | 64 +- .../src/kernel/evis/pre_process_nv12_evis.c | 101 +- .../src/kernel/evis/pre_process_rgb_evis.c | 233 +- .../src/kernel/evis/pre_process_yuv420_evis.c | 245 +- .../src/kernel/evis/pre_process_yuv444_evis.c | 250 +- .../src/kernel/evis/resize_bilinear_evis.c | 126 +- .../kernel/evis/space2depth_internal_evis.c | 366 + .../src/kernel/evis/upsamplescale_evis.c | 422 ++ .../kernel/vsi_nn_kernel_gpu_shape_optimize.c | 154 +- src/tim/vx/internal/src/kernel/vx/clip_vx.c | 6 +- .../vx/internal/src/kernel/vx/convolutional.c | 15 +- .../internal/src/kernel/vx/eltwise_unary_vx.c | 6 +- .../vx/internal/src/kernel/vx/relu_keras_vx.c | 6 +- .../libnnext/ops/cl/layer_normalization.cl | 143 + .../internal/src/libnnext/ops/cl/matrixmul.cl | 190 +- .../src/libnnext/ops/cl/matrixmul_transA.cl | 40 +- .../internal/src/libnnext/ops/cl/roi_align.cl | 108 + .../libnnext/ops/cl/space2depth_internal.cl | 90 + .../libnnext/ops/kernel/vsi_nn_kernel_crop.c | 253 - .../ops/kernel/vsi_nn_kernel_fullconnect2.c | 323 - .../ops/kernel/vsi_nn_kernel_layernormalize.c | 688 -- .../ops/kernel/vsi_nn_kernel_reduce.c | 190 - .../ops/kernel/vsi_nn_kernel_resize.c | 283 - .../ops/kernel/vsi_nn_kernel_roi_align.c | 317 - .../libnnext/ops/kernel/vsi_nn_kernel_scale.c | 410 - .../ops/kernel/vsi_nn_kernel_shufflechannel.c | 345 - .../ops/kernel/vsi_nn_kernel_space2depth.c | 293 - .../src/libnnext/ops/vx/a_times_b_plus_c.vx | 78 + .../vx/internal/src/libnnext/ops/vx/gather.vx | 121 +- .../src/libnnext/ops/vx/gather_mix.vx | 106 +- .../libnnext/ops/vx/layer_normalization.vx | 279 + ...normalize.vx => layer_normalization_2d.vx} | 62 +- .../ops/vx/layer_normalization_i16.vx | 167 + .../ops/vx/layer_normalization_u8_f16.vx | 252 + .../ops/vx/layer_normalization_wh_f16.vx | 426 ++ .../ops/vx/layer_normalization_wh_i16.vx | 266 + .../ops/vx/layer_normalization_wh_u8.vx | 419 ++ .../libnnext/ops/vx/pre_process_bgra_trans.vx | 136 - .../ops/vx/pre_process_nv12_trans_u8.vx | 89 - .../ops/vx/pre_process_rgb_copy_trans.vx | 94 - 
.../libnnext/ops/vx/pre_process_rgb_trans.vx | 172 - .../ops/vx/pre_process_yuv420_copy_u8.vx | 147 - .../ops/vx/pre_process_yuv420_trans_u8.vx | 235 - .../ops/vx/pre_process_yuv444_copy_u8.vx | 147 - .../ops/vx/pre_process_yuv444_trans_u8.vx | 196 - .../libnnext/ops/vx/resize_bilinear_BF16.vx | 144 +- .../libnnext/ops/vx/resize_bilinear_F16.vx | 315 +- .../libnnext/ops/vx/resize_bilinear_I16.vx | 212 +- .../src/libnnext/ops/vx/resize_bilinear_I8.vx | 189 +- .../src/libnnext/ops/vx/resize_bilinear_U8.vx | 276 +- .../libnnext/ops/vx/resize_bilinear_U8_opt.vx | 16 +- .../src/libnnext/ops/vx/resize_nearest.vx | 202 +- .../libnnext/ops/vx/space2depth_internal.vx | 135 + .../src/libnnext/ops/vx/upsamplescale.vx | 58 + .../src/libnnext/ops/vx/upsamplescale_k2.vx | 83 + .../src/libnnext/ops/vx/vsi_nn_kernel_crop.vx | 111 - .../ops/vx/vsi_nn_kernel_fullconnect2.vx | 63 - .../ops/vx/vsi_nn_kernel_layernormalize_U8.vx | 129 - .../libnnext/ops/vx/vsi_nn_kernel_resize.vx | 38 - .../libnnext/ops/vx/vsi_nn_kernel_scale.vx | 49 - .../ops/vx/vsi_nn_kernel_shufflechannel.vx | 67 - .../vx/vsi_nn_kernel_shufflechannel_axis1.vx | 65 - .../ops/vx/vsi_nn_kernel_space2depth.vx | 41 - .../src/libnnext/vsi_nn_libnnext_resource.c | 6652 ++++++++++------- src/tim/vx/internal/src/makefile.linux | 7 +- .../vx/internal/src/ops/vsi_nn_op_argmaxmin.c | 5 +- src/tim/vx/internal/src/ops/vsi_nn_op_crop.c | 215 - .../internal/src/ops/vsi_nn_op_dataconvert.c | 3 + .../src/ops/vsi_nn_op_deconvolution.c | 1 + .../src/ops/vsi_nn_op_embedding_lookup.c | 4 + .../internal/src/ops/vsi_nn_op_fullconnect2.c | 233 - .../vx/internal/src/ops/vsi_nn_op_gather.c | 1 + .../src/ops/vsi_nn_op_instancenormalize.c | 147 +- .../src/ops/vsi_nn_op_l2normalizescale.c | 62 +- .../src/ops/vsi_nn_op_layernormalize.c | 356 +- .../vx/internal/src/ops/vsi_nn_op_matrixmul.c | 20 +- .../internal/src/ops/vsi_nn_op_pre_process.c | 622 +- .../src/ops/vsi_nn_op_pre_process_bgra.c | 25 +- .../src/ops/vsi_nn_op_pre_process_nv12.c | 25 +- .../src/ops/vsi_nn_op_pre_process_rgb.c | 12 +- .../src/ops/vsi_nn_op_pre_process_yuv420.c | 25 +- .../src/ops/vsi_nn_op_pre_process_yuv444.c | 25 +- .../vx/internal/src/ops/vsi_nn_op_reduce.c | 146 - .../vx/internal/src/ops/vsi_nn_op_resize.c | 245 +- .../vx/internal/src/ops/vsi_nn_op_roi_align.c | 242 +- src/tim/vx/internal/src/ops/vsi_nn_op_scale.c | 280 +- .../src/ops/vsi_nn_op_shufflechannel.c | 289 - .../internal/src/ops/vsi_nn_op_space2depth.c | 330 +- .../src/ops/vsi_nn_op_space2depth_internal.c | 159 + .../src/ops/vsi_nn_op_tensorstackconcat.c | 8 +- .../vx/internal/src/ops/vsi_nn_op_unstack.c | 2 +- .../src/ops/vsi_nn_op_upsamplescale.c | 253 + .../src/utils/vsi_nn_code_generator.c | 1 + src/tim/vx/internal/src/vsi_nn_graph.c | 225 +- .../vx/internal/src/vsi_nn_pre_post_process.c | 25 + 120 files changed, 14252 insertions(+), 11997 deletions(-) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_space2depth_internal.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h create mode 100644 src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/roi_align_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c create mode 100644 
src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/layer_normalization.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/space2depth_internal.cl delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx rename src/tim/vx/internal/src/libnnext/ops/vx/{vsi_nn_kernel_layernormalize.vx => layer_normalization_2d.vx} (87%) create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/space2depth_internal.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c create mode 100644 
src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c diff --git a/src/tim/vx/internal/BUILD b/src/tim/vx/internal/BUILD index ae21b3d..1803e76 100644 --- a/src/tim/vx/internal/BUILD +++ b/src/tim/vx/internal/BUILD @@ -194,22 +194,13 @@ cc_library( "src/kernel/vsi_nn_kernel_param.c", "src/kernel/vsi_nn_gpu.c", "src/kernel/vsi_nn_kernel_gpu_shape_optimize.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_crop.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_resize.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_scale.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c", "src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c", "src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c", "src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c", "src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c", "src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c", "src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c", "src/libnnext/ops/kernel/vsi_nn_kernel_topk.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c", "src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c", "src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c", "src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c", diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 88c74c2..523f299 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -146,3 +146,4 @@ DEF_OP(SCATTER_ND) DEF_OP(DECONVOLUTION1D) DEF_OP(INTERP) DEF_OP(RESIZE_1D) +DEF_OP(UPSAMPLESCALE) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def index e8f677b..ab04552 100644 --- a/src/tim/vx/internal/include/internal/internal_ops.def +++ b/src/tim/vx/internal/include/internal/internal_ops.def @@ -16,3 +16,4 @@ DEF_OP(GRUCELL_ACTIVATION_INTERNAL) DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA) DEF_OP(RESIZE_1D_BILINEAR_INTERNAL) DEF_OP(RESIZE_1D_NEAREST_INTERNAL) +DEF_OP(SPACE2DEPTH_INTERNAL) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h index bf2b95d..0b65afc 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -38,6 +38,14 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape int32_t* out_axis, uint32_t* out_axis_size ); +vsi_bool vsi_nn_kernel_optimize_tensor_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t *axis, const size_t axis_size, + int32_t* out_shape_x, uint32_t* out_rank_x, + int32_t* out_axis, uint32_t* out_axis_size + ); + vsi_bool vsi_nn_kernel_optimize_element_shape ( const int32_t* shape_x, const size_t rank_x, @@ -59,4 +67,16 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape int32_t* out_shape_output, uint32_t* out_rank_output ); +vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape + ( + const int32_t* shape, const uint32_t rank, + int32_t* out_shape, uint32_t* out_rank + ); + +vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape + ( + const int32_t* shape, const uint32_t rank, + int32_t* out_shape, uint32_t* out_rank + ); + #endif diff --git a/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h 
b/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h index 4941769..2245dff 100644 --- a/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h +++ b/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h @@ -372,10 +372,6 @@ enum vx_kernel_libnnext_offset_e #define VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_Int16_copy" #define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8 VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8" #define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8_copy" -#define VX_KERNEL_NAME_LAYERNORM VIVANTE_NAMESPACE ".vxcLayerNorm" -#define VX_KERNEL_NAME_LAYERNORM_UINT8 VIVANTE_NAMESPACE ".vxcLayerNorm_u8" -#define VX_KERNEL_NAME_LAYERNORM_FP16TOU8 VIVANTE_NAMESPACE ".vxcLayerNormFP16toU8" -#define VX_KERNEL_NAME_LAYERNORM_U8TOFP16 VIVANTE_NAMESPACE ".vxcLayerNormU8toFp16" #define VX_KERNEL_NAME_TENSORSTACKCONCAT VIVANTE_NAMESPACE ".vxcTensorStackConcat" #define VX_KERNEL_NAME_TENSORSTACKCONCAT8BITS VIVANTE_NAMESPACE ".vxcTensorStackConcat8Bits" #define VX_KERNEL_NAME_SIGNALFRAME_WIDTH VIVANTE_NAMESPACE ".vxcSignalFrame_width" diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h index 5ec359b..e70dc41 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h @@ -70,6 +70,10 @@ typedef struct _vsi_nn_instancenorm_lcl_data2 uint32_t reshapeFlg; uint32_t hash_idx; vsi_bool execute_on_sw; + + /* handle 3D instance norm */ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; } vsi_nn_instancenorm_lcl_data2; typedef struct _vsi_nn_instancenorm_lcl_data diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_space2depth_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_space2depth_internal.h new file mode 100644 index 0000000..e5630ca --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_space2depth_internal.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SPACE2DEPTH_INTERNAL_H +#define _VSI_NN_OP_SPACE2DEPTH_INTERNAL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_space2depth_internal_param +{ + int32_t block_size_x; + int32_t block_size_y; +} vsi_nn_space2depth_internal_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h b/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h new file mode 100644 index 0000000..f790da2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h @@ -0,0 +1,39 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_UPSAMPLESCALE_H +#define _VSI_NN_OP_UPSAMPLESCALE_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_upsamplescale_param +{ + struct _upsamplescale_local_data_t* local; + // Add parameters here + int32_t stride; + float scale; +} vsi_nn_upsamplescale_param; + +#endif + diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 6e3e6fd..584bdd8 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -677,6 +677,11 @@ OVXLIB_API vsi_status vsi_nn_TrySetupCompleteSignalNode vsi_nn_graph_t* graph ); +vsi_status vsi_nn_setup_binary_graph_inputs_outputs + ( + vsi_nn_graph_t* graph + ); + void vsi_nn_get_tensor_consumers ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 9f13725..89cd104 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -56,6 +56,7 @@ #include "ops/vsi_nn_op_elu.h" #include "ops/vsi_nn_op_reverse.h" #include "ops/vsi_nn_op_space2depth.h" +#include "ops/vsi_nn_op_space2depth_internal.h" #include "ops/vsi_nn_op_depth2space.h" #include "ops/vsi_nn_op_depth2space_internal.h" #include "ops/vsi_nn_op_maximum.h" @@ -162,6 +163,7 @@ #include "ops/vsi_nn_op_resize_1d.h" #include "ops/vsi_nn_op_resize_1d_bilinear_internal.h" #include "ops/vsi_nn_op_resize_1d_nearest_internal.h" +#include "ops/vsi_nn_op_upsamplescale.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -204,6 +206,7 @@ typedef union _vsi_nn_nn_param vsi_nn_elu_param elu; vsi_nn_reverse_param reverse; vsi_nn_space2depth_param space2depth; + vsi_nn_space2depth_internal_param space2depth_internal; vsi_nn_depth2space_param depth2space; vsi_nn_depth2space_internal_param depth2space_internal; vsi_nn_maximum_param maximum; @@ -310,6 +313,7 @@ typedef union _vsi_nn_nn_param vsi_nn_resize_1d_param resize_1d; vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal; vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal; + vsi_nn_upsamplescale_param upsamplescale; uint8_t client_param[128]; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 501fca3..74938f7 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -65,6 +65,12 @@ typedef enum VSI_NN_SOURCE_LAYOUT_NCHW, } vsi_nn_preprocess_source_layout_e; +typedef enum +{ + VSI_NN_DEST_LAYOUT_NHWC = 0, + VSI_NN_DEST_LAYOUT_NCHW, +} vsi_nn_preprocess_dest_layout_e; + /** * Input source format */ diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 1b73c36..fe470a0 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -214,7 +214,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) width = input_shape->data[0]; height = input_shape->data[1]; chn = attr[1]->shape->data[1]; - if(rsFlg) + if (rsFlg) { height = height / chn; } @@ -281,7 +281,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) width = input_shape->data[0]; height = input_shape->data[1]; chn = attr[1]->shape->data[1]; - if(rsFlg) 
+ if (rsFlg) { height = height / chn; } @@ -355,12 +355,12 @@ static vsi_status _query_kernel for( i = 0; i < kernel_map_size; i ++ ) { - if( kernel_map[i].key == hashkey ) + if ( kernel_map[i].key == hashkey ) { break; } } - if( i < kernel_map_size ) + if ( i < kernel_map_size ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = param_def; @@ -413,19 +413,23 @@ static vsi_nn_kernel_node_t _setup int32_t width = inputs[0]->attr.size[0]; int32_t height = inputs[0]->attr.size[1]; int32_t group_num = (width + 15) / 16; - int32_t input_zp = inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; - int32_t input_fl = inputs[0]->attr.dtype.fl; - int32_t output_zp = outputs[0]->attr.dtype.zero_point; - float output_scale = outputs[0]->attr.dtype.scale; - int32_t output_fl = outputs[0]->attr.dtype.fl; + int32_t input_zp = 0; + float input_scale = 1.0f; + int32_t input_fl = 0; + int32_t output_zp = 0; + float output_scale = 1.0f; + int32_t output_fl = 0; float in_fl_scale = 1.0f, out_fl_scale = 1.0; float dim_ratio = (float)1.0 / (float)(width * height); - if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) { + input_zp = inputs[0]->attr.dtype.zero_point; + input_scale = inputs[0]->attr.dtype.scale; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + input_fl = inputs[0]->attr.dtype.fl; if (input_fl > 0) { in_fl_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); @@ -434,12 +438,17 @@ static vsi_nn_kernel_node_t _setup { in_fl_scale = ((float) ((int64_t)1 << -input_fl)); } + input_zp = 0; } - if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) { + output_zp = outputs[0]->attr.dtype.zero_point; + output_scale = 1.0f / outputs[0]->attr.dtype.scale; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + output_fl = outputs[0]->attr.dtype.fl; if (output_fl > 0) { out_fl_scale = (float)((int64_t)1 << output_fl); @@ -448,9 +457,10 @@ static vsi_nn_kernel_node_t _setup { out_fl_scale = (1.0f / (float)((int64_t)1 << -output_fl)); } + output_zp = 0; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -482,17 +492,17 @@ static vsi_nn_kernel_node_t _setup hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } - if(reshape_flg) + if (reshape_flg) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[0]->attr.size[0]; @@ -507,7 +517,7 @@ static vsi_nn_kernel_node_t _setup shape[3] = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); } - if(inputs[1]->attr.dim_num < 2) + if (inputs[1]->attr.dim_num < 2) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[1]->attr.size[0]; @@ -516,7 +526,7 @@ static vsi_nn_kernel_node_t _setup shape[3] = 1; rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); } - if(inputs[2]->attr.dim_num < 2) + if (inputs[2]->attr.dim_num < 2) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[2]->attr.size[0]; @@ -528,10 +538,10 @@ static vsi_nn_kernel_node_t _setup // Mean Vari { node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); - if(node) + if (node) { uint32_t index = 0; - if(reshape_flg) + if (reshape_flg) { mean_vari_node_params[index++] = rs_input; } @@ -565,10 +575,10 @@ static vsi_nn_kernel_node_t _setup // Nomalization { node = vsi_nn_kernel_create_node( graph, kernel ); - if(node) + if (node) { uint32_t index = 0; - if(reshape_flg) + if (reshape_flg) { node_params[index++] = rs_input; } @@ -576,7 +586,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; } - if(inputs[1]->attr.dim_num < 2) + if (inputs[1]->attr.dim_num < 2) { node_params[index++] = rs_beta; } @@ -584,7 +594,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; } - if(inputs[2]->attr.dim_num < 2) + if (inputs[2]->attr.dim_num < 2) { node_params[index++] = rs_gamma; } @@ -593,7 +603,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; } node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - if(reshape_flg) + if (reshape_flg) { node_params[index++] = rs_output; } @@ -634,26 +644,26 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. */ final: - if(rs_beta) + if (rs_beta) { vsi_nn_kernel_tensor_release( &rs_beta ); } - if(rs_gamma) + if (rs_gamma) { vsi_nn_kernel_tensor_release( &rs_gamma ); } - if(reshape_flg) + if (reshape_flg) { vsi_nn_kernel_tensor_release( &rs_input ); vsi_nn_kernel_tensor_release( &rs_output ); } for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { - if( ikernels[i] ) + if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if( tensors[i] ) + if ( tensors[i] ) { vsi_nn_ReleaseTensor( &tensors[i] ); } diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c new file mode 100644 index 0000000..166f779 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -0,0 +1,395 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +#define KERNEL_SOURCE_1 "layer_normalization" + +#define HASH_LAYERNORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.layer_norm_"#SRC0_TYPE"to"#DST_TYPE) + +// Add kernel hashtable here +#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_LAYERNORM_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _layernorm_kernel_map[] = +{ + // Register kernel here + TENSOR_LAYERNORM_KERNELS( F32, F32, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _layernorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _LAYERNORM_PARAM_NUM _cnt_of_array( _layernorm_kernel_param_def ) + +/* + * Kernel initializer + */ + +DEF_KERNEL_INITIALIZER(_layernorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * input_shape = NULL; + //int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + input_shape = attr[0]->shape; + //width = input_shape->data[0]; + height = input_shape->data[1]; + chn = (input_shape->size <= 2) ? 1 : input_shape->data[2]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _layernorm_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t* kernel, + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t reshape2D + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (input0_dtype == F16 && output_dtype == F16) + { + input0_dtype = F32; + output_dtype = F32; + } + + key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, 0 ); + + for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ ) + { + if ( _layernorm_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_layernorm_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_kernel_map[i].function_name ); + kernel->info.parameters = _layernorm_kernel_param_def; + kernel->info.numParams = _LAYERNORM_PARAM_NUM; + kernel->info.initialize = _layernorm_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + _layernorm_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _layernorm_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_gamma = NULL, rs_beta = NULL; + + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + int32_t width = inputs[0]->attr.size[0]; + int32_t height = inputs[0]->attr.size[1]; + int32_t input_fl = 0; + float input_zp = 0.0f; + float input_scale = 1.0f; + int32_t output_fl = 0; + float output_zp = 0.0f; + float output_scale = 1.0f; + float e2InScale = 1.0f, scale_inOut = 1.0f; + float dim_ratio = (float)1.0 / (float)(width); + float sumZpScale = 0.0f; + float zp2ScaleE2 = 0.0f; + float sumZpScaleE2 = 0.0f; + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + input_zp = 
(float)inputs[0]->attr.dtype.zero_point; + input_scale = inputs[0]->attr.dtype.scale; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + input_fl = inputs[0]->attr.dtype.fl; + if (input_fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -input_fl)); + } + input_zp = 0.0f; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + output_zp = (float)outputs[0]->attr.dtype.zero_point; + output_scale = 1.0f / outputs[0]->attr.dtype.scale; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + output_fl = outputs[0]->attr.dtype.fl; + if (output_fl > 0) + { + output_scale = (float)((int64_t)1 << output_fl); + } + else + { + output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); + } + output_zp = 0.0f; + } + scale_inOut = input_scale * output_scale; + e2InScale = input_scale * input_scale; + sumZpScale = width * input_zp * input_scale; + zp2ScaleE2 = input_zp * 2 * e2InScale; + sumZpScaleE2 = width * input_zp * input_zp * e2InScale; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, 0 ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + if (inputs[1]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[1]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); + } + if (inputs[2]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[2]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); + } + + // Nomalization + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + if (inputs[1]->attr.dim_num < 2) + { + node_params[index++] = rs_beta; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + if (inputs[2]->attr.dim_num < 2) + { + node_params[index++] = rs_gamma; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + } + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &e2InScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_inOut ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &sumZpScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp2ScaleE2 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &sumZpScaleE2 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _LAYERNORM_PARAM_NUM ); + CHECK_STATUS(status); + 
vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + vsi_nn_kernel_scalar_release( &node_params[15] ); + vsi_nn_kernel_scalar_release( &node_params[16] ); + } + } + + /* Pass parameters to node. */ +final: + if (rs_beta) + { + vsi_nn_kernel_tensor_release( &rs_beta ); + } + if (rs_gamma) + { + vsi_nn_kernel_tensor_release( &rs_gamma ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( layer_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index 272d2b0..5ccc69e 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -59,6 +59,9 @@ __BEGIN_DECLS #define HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ CVIVANTE_NAMESPACE("cl.gemm_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) +#define HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ + CVIVANTE_NAMESPACE("cl.gemm_transb_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) + #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \ HASH_MATRIXMUL_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \ @@ -69,6 +72,11 @@ __BEGIN_DECLS HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \ SOURCE }, +#define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2), \ + HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -83,6 +91,10 @@ static const struct { TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2) TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1) }; /* @@ -98,6 +110,12 @@ static vx_param_description_t _matrixmul_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _MATRIXMUL_PARAM_NUM _cnt_of_array(_matrixmul_kernel_param_def) @@ -130,7 +148,7 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer) CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer 
fail.", final ); width = attr[0]->shape->data[0]; - height = attr[0]->shape->data[0]; + height = attr[0]->shape->data[1]; chn = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; gpu_param.global_scale[0] = 1; @@ -175,22 +193,27 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(depth > 1) + if (depth > 1) { dim_type = _3D; } + if (input1_dtype == I16 || input1_dtype == I32) + { + input1_dtype = I8; + } + key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa ); for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ ) { - if( matrixmul_map[i].key == key ) + if ( matrixmul_map[i].key == key ) { break; } } - if( i < _cnt_of_array(matrixmul_map) ) + if ( i < _cnt_of_array(matrixmul_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrixmul_map[i].function_name ); kernel->info.parameters = _matrixmul_kernel_param_def; @@ -223,48 +246,111 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); + int32_t transFlg = 0; uint32_t M = inputs[0]->attr.size[1]; uint32_t K = inputs[0]->attr.size[0]; uint32_t N = inputs[1]->attr.size[0]; uint32_t depth = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1; uint32_t ac2zero = 0; uint32_t bc2zero = 0; + float scale_a = 1.0f; + float zp_a = 0; + float scale_b = 1.0f; + float zp_b = 0; + float scale_out = 1.0f; + float zp_out = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - if(transposeB) + if (transposeB) { - return NULL; + N = inputs[1]->attr.size[1]; + transFlg = 2; } - if(transposeA) + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + if (inputs[0]->attr.dtype.fl > 0) + { + scale_a = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); + } + else + { + scale_a = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl)); + } + zp_a = 0; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + zp_a = (float)inputs[0]->attr.dtype.zero_point; + scale_a = inputs[0]->attr.dtype.scale; + } + + if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + if (inputs[1]->attr.dtype.fl > 0) + { + scale_b = (1.0f / ((float) ((int64_t)1 << inputs[1]->attr.dtype.fl))); + } + else + { + scale_b = ((float) ((int64_t)1 << -inputs[1]->attr.dtype.fl)); + } + zp_b = 0; + } + else if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + zp_b = (float)inputs[1]->attr.dtype.zero_point; + scale_b = inputs[1]->attr.dtype.scale; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + if (outputs[0]->attr.dtype.fl > 0) + { + scale_out = (float)((int64_t)1 << outputs[0]->attr.dtype.fl); + } + else + { + scale_out = (1.0f / (float)((int64_t)1 << -outputs[0]->attr.dtype.fl)); + } + zp_out = 0; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + zp_out = (float)outputs[0]->attr.dtype.zero_point; + scale_out = outputs[0]->attr.dtype.scale; + } + + if (transposeA) { K = inputs[0]->attr.size[1]; M = inputs[0]->attr.size[0]; + transFlg = 1; } - if((inputs[0]->attr.dim_num > 
inputs[1]->attr.dim_num) || + if ((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) || (inputs[0]->attr.size[2] > inputs[1]->attr.size[2] && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) { bc2zero = 1; } - else if((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) || + else if ((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) || (inputs[1]->attr.size[2] > inputs[0]->attr.size[2] && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) { ac2zero = 1; } - status = _query_kernel( kernel, inputs, outputs, depth, transposeA ); - if( VSI_SUCCESS == status) + status = _query_kernel( kernel, inputs, outputs, depth, transFlg ); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 3; /* Pass parameters to node. */ @@ -275,6 +361,12 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ac2zero ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bc2zero ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_a ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_a ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_out ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_out ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _MATRIXMUL_PARAM_NUM ); CHECK_STATUS(status); @@ -283,6 +375,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[5] ); vsi_nn_kernel_scalar_release( &node_params[6] ); vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); } } return node; diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c new file mode 100644 index 0000000..3f4402a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -0,0 +1,329 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +#define _ROI_ALIGN_KERNEL_SOURCE(_input_type) "roi_align" + +#define STR(a) #a +// Add kernel hashtable here +#define ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, _image_2d ) \ + (( IN0_DTYPE ) | ( IN1_DTYPE << 7) | (IN2_DTYPE << 14) | (OUT_DTYPE << 21) | (_image_2d << 28)) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE ) \ + { ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"to"STR(OUT_DTYPE)), \ + _ROI_ALIGN_KERNEL_SOURCE(IN0_DTYPE) } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _roi_align_kernel_map[] = +{ + PACK_KERNEL_MAP(F32, F32, I32, F32), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _roi_align_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) + +#define SCALAR_SPATIAL_X_SCALE (4) +#define SCALAR_SPATIAL_Y_SCALE (5) +#define SCALAR_INPUT_WIDTH (6) +#define SCALAR_INPUT_HEIGHT (7) +#define SCALAR_RCP_OF_OUTPUT_WIDTH (8) +#define SCALAR_RCP_OF_OUTPUT_HEIGHT (9) +#define SCALAR_SAMPLING_X_RATIO (10) +#define SCALAR_SAMPLING_Y_RATIO (11) +#define SCALAR_DEPTH (12) + +#define ROI_ALIGN_PARAM_NUM 13 +#define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_roi_align_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * rois_attr = NULL; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * rois_shape = NULL; + vsi_int_array_t * out_shape = NULL; + + rois_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( rois_attr, "Create tensor attr buffer fail.", 
final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + rois_shape = rois_attr->shape; + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = rois_shape->data[1]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + + return status; +} /* _roi_align_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _roi_align_kernel_map; + size_t kernel_map_size = _cnt_of_array( _roi_align_kernel_map ); + vx_param_description_t * param_def = _roi_align_kernel_param_def; + size_t param_def_size = ROI_ALIGN_QUANT_PARAM_NUM; + vx_kernel_initialize_f initializer = _roi_align_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + in0_dtype = in0_dtype == F16 ? F32 : in0_dtype; + in1_dtype = in1_dtype == F16 ? 
F32 : in1_dtype; + + key = ROI_ALIGN_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ROI_ALIGN_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + uint32_t rank[_IO_NUM] = {0}; + int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + int32_t i = 0; + float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" ); + float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" ); + int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" ); + int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" ); + float width_scale = 1.0f / width_ratio; + float height_scale = 1.0f / height_ratio; + float in_width = (float)(inputs[0]->attr.size[0]); + float in_height = (float)(inputs[0]->attr.size[1]); + float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]); + float rcp_of_out_height = 1.0f / (float)(outputs[0]->attr.size[1]); + float sampling_x_ratio = width_sample_num > 0 ? (float)width_sample_num : 0; + float sampling_y_ratio = height_sample_num > 0 ? 
(float)height_sample_num : 0; + int depth = inputs[0]->attr.size[2]; + + vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], &rank[0]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + shapes[1], &rank[1]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[2]->attr.size, inputs[2]->attr.dim_num, + shapes[2], &rank[2]); + vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[3], &rank[3]); + + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], (uint32_t*)shapes[i], rank[i] ); + } + reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[_INPUT_NUM], image_2d); + + if ( VSI_SUCCESS == status ) + { + size_t node_params_num = ROI_ALIGN_PARAM_NUM; + + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); + + node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale ); + node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale ); + node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width ); + node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &in_height ); + node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_width ); + node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_height ); + node_params[SCALAR_SAMPLING_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_x_ratio ); + node_params[SCALAR_SAMPLING_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_y_ratio ); + node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth ); + + /* Pass parameters to node. 
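The node takes its own reference to each scalar, so the local handles are released immediately after the pass.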
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_X_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_Y_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_HEIGHT] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_X_RATIO] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_Y_RATIO] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] ); + } + } + + for (i = 0; i < _IO_NUM; i++) + { + if (reshape_tensors[i]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[i] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( roi_align, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c new file mode 100644 index 0000000..b021962 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c @@ -0,0 +1,298 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
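+ * The hash key combines the input/output dtypes with an optimization flag; only block_size 2x1 selects the dedicated *_X2Y1 kernels.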
+ */ +#define KERNEL_SOURCE_1 "space2depth_internal" + +#define HASH_SPACE2DEPTH_INTERNAL_KEY(_input0_type, _output_type, _opt_flg) \ + ((_input0_type << 24) | (_output_type << 16) | (_opt_flg << 8)) + +#define HASH_SPACE2DEPTH_INTERNAL_CL_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.space2depth_internal_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_SPACE2DEPTH_INTERNAL_X2Y1_CL_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.space2depth_internal_"#SRC0_TYPE"to"#DST_TYPE"_X2Y1") + +#define TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_SPACE2DEPTH_INTERNAL_CL_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + + #define TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_SPACE2DEPTH_INTERNAL_X2Y1_CL_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(F32, F32, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(F32, F32, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(U8, U8, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * in_shape = NULL; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + in_shape = attr[0]->shape; + width = in_shape->data[0]; + height = in_shape->data[1]; + chn = in_shape->size > 2 ? 
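/* use the channel count when the shape has one, otherwise 1 */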
in_shape->data[2] : 1; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} /* _space2depth_internal_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t opt_flg + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_SPACE2DEPTH_INTERNAL_KEY( input0_dtype, output_dtype, opt_flg ); + + if (input0_dtype == F16 && output_dtype == F16) + { + input0_dtype = F32; + output_dtype = F32; + } + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _space2depth_internal_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" ); + int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); + int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 
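/* only the 2x1 block size has an optimized kernel variant */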
1 : 0; + + float inputScale = inputs[0]->attr.dtype.scale; + int32_t inputZp = inputs[0]->attr.dtype.zero_point; + float outputScale = outputs[0]->attr.dtype.scale; + int32_t outputZp = outputs[0]->attr.dtype.zero_point; + float scaleInOut = 1.0f; + float zpInOut = 0.0f; + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + int32_t input_fl = inputs[0]->attr.dtype.fl; + if (input_fl > 0) + { + inputScale = (1.0f / ((float) ((int64_t)1 << input_fl))); + } + else + { + inputScale = ((float) ((int64_t)1 << -input_fl)); + } + inputZp = 0; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) + { + inputScale = 1.0f; + inputZp = 0; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + int32_t output_fl = outputs[0]->attr.dtype.fl; + if (output_fl > 0) + { + outputScale = (1.0f / ((float) ((int64_t)1 << output_fl))); + } + else + { + outputScale = ((float) ((int64_t)1 << -output_fl)); + } + outputZp = 0; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) + { + outputScale = 1.0f; + outputZp = 0; + } + scaleInOut = inputScale / outputScale; + zpInOut = outputZp - inputZp * scaleInOut; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, opt_flg); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if ( node ) + { + int32_t index = 2; + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 1, outputs, 1 ); + node_params[index++] = vsi_nn_kernel_scalar_create( + graph, I32, &block_size_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( + graph, I32, &block_size_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( + graph, F32, &scaleInOut ); + node_params[index] = vsi_nn_kernel_scalar_create( + graph, F32, &zpInOut ); + + /* Pass parameters to node. 
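scaleInOut and zpInOut fold the input and output quantization into a single affine step, out = in * scaleInOut + zpInOut.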
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( space2depth_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c index b1e9860..6720a14 100644 --- a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c @@ -173,7 +173,7 @@ final: if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } } return status; -} /* _pre_process_yuv420_exec() */ +} /* _instance_norm_exec() */ /* * Kernel params */ diff --git a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c new file mode 100644 index 0000000..d6d9802 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c @@ -0,0 +1,255 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
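+ * The CPU reference takes the input, bias and scale tensors plus an eps scalar, and writes a single output tensor.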
+ */ +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (3) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.layer_norm") + +DEF_KERNEL_EXECUTOR(_layer_norm_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + float eps = .0f; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input1 buffer fail.", final ); + + buffer[3] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + memset( buffer[3], 0, out_elements * sizeof(float) ); + + { + uint32_t axis_first = 0; + uint32_t axis_num = 1; + uint32_t outerSize = 1; + uint32_t axisSize = 1; + uint32_t innerSize = 1; + uint32_t inner = 0; + uint32_t outer = 0; + + for (i = 0; i < (uint32_t)axis_first; i++) + { + innerSize *= attr[0]->shape->data[i]; + } + + for(i = 0; i < (uint32_t)axis_num; i++) + { + axisSize *= attr[0]->shape->data[axis_first + i]; + } + + for (i = (uint32_t)axis_first + axis_num; i < attr[0]->shape->size; i++) + { + outerSize *= attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + + for (i = 0; i < (uint32_t)axisSize; ++i) + { + float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; + sum += value; + sumsq += (value * value); + } + mean = sum / (axisSize); + vari = sumsq / (axisSize) - mean * mean; + vari = (float)(1.0 / sqrtf(vari + eps)); + + for (i = 0; i < (uint32_t)axisSize; ++i) + { + int idx = (outer * axisSize + i) * innerSize + inner; + float data = buffer[0][idx] - mean; + float scaleVal = buffer[2][idx]; + float biasVal = buffer[1][idx]; + float normVal = data * vari * scaleVal + biasVal; + buffer[3][idx] = normVal; + } + } + } 
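+ /* The loops above implement layer normalization per (outer, inner) slice:
+ * mean = sum(x) / axisSize
+ * vari = sum(x^2) / axisSize - mean^2
+ * y = (x - mean) / sqrt(vari + eps) * scale + bias
+ */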
+ } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _layer_norm_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _layer_normalization_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _LAYER_NORMALIZATION_PARAM_NUM _cnt_of_array( _layer_normalization_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _layer_norm_exec, + _layer_normalization_kernel_param_def, + _LAYER_NORMALIZATION_PARAM_NUM, + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( layer_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c new file mode 100644 index 0000000..2aa18cd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c @@ -0,0 +1,378 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
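+ * Inputs are the feature map, the ROI boxes (4 values per ROI) and the per-ROI batch indices; four scalars carry the spatial scale ratios and sample counts.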
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.roi_align") + + +/* + * Kernel params + */ +static vx_param_description_t _roi_align_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) +#define SCALAR_X_RATIO (4) +#define SCALAR_Y_RATIO (5) +#define SCALAR_X_SAMPLE (6) +#define SCALAR_Y_SAMPLE (7) + +/* + * Kernel function + */ +static float _compute_region_coordinate(int32_t p, float bin_size, float roi_anchor, float max_value) +{ + const float region_start = p * bin_size + roi_anchor; + + return vsi_nn_clamp(region_start, 0.0f, max_value - 1); +} + +static float _roi_align_1x1(float *input_ptr, + int32_t width, + int32_t height, + float region_start_x, + float bin_size_x, + int32_t grid_size_x, + float region_end_x, + float region_start_y, + float bin_size_y, + int32_t grid_size_y, + float region_end_y) +{ + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + { + return 0; + } + else + { + float avg = 0; + int32_t iy = 0; + int32_t ix = 0; + // Iterate through the aligned pooling region + for (iy = 0; iy < grid_size_y; ++iy) + { + for (ix = 0; ix < grid_size_x; ++ix) + { + // Align the window in the middle of every bin + float y = region_start_y + ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y); + float x = region_start_x + ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x); + + // Interpolation in the [0,0] [0,1] [1,0] [1,1] square + const int32_t y_low = (int32_t)y; + const int32_t x_low = (int32_t)x; + const int32_t y_high = vsi_nn_min(y_low + 1, height - 1); + const int32_t x_high = vsi_nn_min(x_low + 1, width - 1); + + const float ly = y - y_low; + const float lx = x - x_low; + const float hy = 1.0f - ly; + const float hx = 1.0f - lx; + + const float w1 = hy * hx; + const float w2 = hy * lx; + const float w3 = ly * hx; + const float w4 = ly * lx; + + const float data1 = *(input_ptr + y_low * width + x_low); + const float data2 = *(input_ptr + y_low * width + x_high); + const float data3 = *(input_ptr + y_high * width + x_low); + const float data4 = *(input_ptr + y_high * width + x_high); + + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + + avg /= grid_size_x * grid_size_y; + + return avg; + } +} + +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + float width_scale = 0.0f; + float height_scale = 0.0f; + float width_ratio = 0.0f; + float height_ratio = 0.0f; + int32_t 
width_sample_num = 0; + int32_t height_sample_num = 0; + uint32_t n = 0; + uint32_t num_rois = 0; + int32_t inHeight = 0; + int32_t inWidth = 0; + int32_t inDepth = 0; + int32_t outHeight = 0; + int32_t outWidth = 0; + uint32_t kRoiDim = 4; + uint32_t out_index = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_X_RATIO], &(width_ratio)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_RATIO], &(height_ratio)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_X_SAMPLE], &(width_sample_num)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_SAMPLE], &(height_sample_num)); + + width_scale = 1.0f / width_ratio; + height_scale = 1.0f / height_ratio; + num_rois = in_attr[1]->shape->data[1]; + + inWidth = in_attr[0]->shape->data[0]; + inHeight = in_attr[0]->shape->data[1]; + inDepth = in_attr[0]->shape->data[2]; + outWidth = out_attr[0]->shape->data[0]; + outHeight = out_attr[0]->shape->data[1]; + + for (n = 0; n < num_rois; n++) + { + uint32_t batchId = (uint32_t)f32_in_buffer[2][n]; + float scale = (in_attr[1]->dtype == U16) ? 0.125f : 1.0f; + float qx1 = f32_in_buffer[1][n * kRoiDim]; + float qy1 = f32_in_buffer[1][n * kRoiDim + 1]; + float qx2 = f32_in_buffer[1][n * kRoiDim + 2]; + float qy2 = f32_in_buffer[1][n * kRoiDim + 3]; + + float x1 = qx1 * scale; + float x2 = qx2 * scale; + float y1 = qy1 * scale; + float y2 = qy2 * scale; + float roi_anchor_x = x1 * width_scale; + float roi_anchor_y = y1 * height_scale; + float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f); + float roi_dims_y = vsi_nn_max((y2 - y1) * height_scale, 1.0f); + float bin_size_x = roi_dims_x / outWidth; + float bin_size_y = roi_dims_y / outHeight; + + int32_t batch_base_index = batchId * inHeight * inWidth * inDepth; + int32_t ch = 0; + int32_t py = 0; + int32_t px = 0; + + for (ch = 0; ch < inDepth; ch++) + { + for (py = 0; py < outHeight; py++) + { + for (px = 0; px < outWidth; px++) + { + float region_start_x = _compute_region_coordinate(px, bin_size_x, + roi_anchor_x, (float)inWidth); + float region_start_y = _compute_region_coordinate(py, bin_size_y, + roi_anchor_y, (float)inHeight); + float region_end_x = _compute_region_coordinate(px + 1, bin_size_x, + roi_anchor_x, (float)inWidth); + float region_end_y = _compute_region_coordinate(py + 1, bin_size_y, + roi_anchor_y, (float)inHeight); + + int32_t roi_bin_grid_x = (width_sample_num > 0) ? width_sample_num : (int32_t)(ceil(bin_size_x)); + int32_t roi_bin_grid_y = (height_sample_num > 0) ? 
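/* a non-positive sample count falls back to an adaptive grid of ceil(bin_size) samples */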
height_sample_num : (int32_t)(ceil(bin_size_y)); + + float *input_ptr = &f32_in_buffer[0][batch_base_index + ch * inWidth * inHeight]; + float out_val = 0; + + out_val = _roi_align_1x1( + input_ptr, inWidth, inHeight, region_start_x, bin_size_x, + roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, + roi_bin_grid_y, region_end_y); + + f32_out_buffer[0][out_index++] = out_val; + } + } + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _roi_align_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _roi_align_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ROI_ALIGN_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" ); + float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" ); + int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" ); + int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" ); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &width_ratio ); + node_params[SCALAR_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &height_ratio ); + node_params[SCALAR_X_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &width_sample_num ); + node_params[SCALAR_Y_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &height_sample_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ROI_ALIGN_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_RATIO] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_RATIO] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_SAMPLE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_SAMPLE] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( roi_align, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c new file mode 100644 index 0000000..4df8a52 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c @@ -0,0 +1,230 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
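+ * space2depth with independent block sizes: each block_size_x by block_size_y spatial tile is folded into the channel
+ * dimension, moving input pixel (w, h, d) to (w / bx, h / by, (w % bx) * D + (h % by) * bx * D + d) for input depth D.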
+ */ +#define _CPU_ARG_NUM (2) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.space2depth_internal") + +DEF_KERNEL_EXECUTOR(_space2depth_internal_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[2] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t block_size_x = 1; + int32_t block_size_y = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size_x); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size_y); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + { + uint32_t output_depth = attr[1]->shape->data[2]; + uint32_t output_height = attr[1]->shape->data[1]; + uint32_t output_width = attr[1]->shape->data[0]; + uint32_t input_batch = attr[0]->shape->size > 3 ? 
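/* inputs without a batch dimension are treated as a single batch */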
attr[0]->shape->data[3] : 1; + uint32_t input_depth = attr[0]->shape->data[2]; + uint32_t input_height = attr[0]->shape->data[1]; + uint32_t input_width = attr[0]->shape->data[0]; + uint32_t batch = 0, in_h = 0, in_w = 0; + + for (batch = 0; batch < input_batch; ++ batch) + { + uint32_t output_batch_index = batch * output_height * output_width * output_depth; + uint32_t input_batch_index = batch * input_height * input_width * input_depth; + uint32_t in_d = 0; + + for (in_d = 0; in_d < input_depth; in_d ++) + { + for (in_h = 0; in_h < input_height; ++ in_h) + { + for (in_w = 0; in_w < input_width; in_w ++) + { + uint32_t out_w = in_w / block_size_x; + uint32_t out_h = in_h / block_size_y; + uint32_t out_d = (in_w % block_size_x) * input_depth + + (in_h % block_size_y) * block_size_x * input_depth + in_d; + + uint32_t in_index = in_w + in_h * input_width + + in_d * input_height * input_width + input_batch_index; + uint32_t out_index = out_w + out_h * output_width + + out_d * output_width * output_height + output_batch_index; + + buffer[1][out_index] = buffer[0][in_index]; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + return status; +} /* _depth2space_crd_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _space2depth_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _space2depth_internal_exec, + _space2depth_internal_kernel_param_def, + _cnt_of_array( _space2depth_internal_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" ); + int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_x ); + backend_params[index] = 
vsi_nn_kernel_scalar_create( graph, I32, &block_size_y ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( space2depth_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c new file mode 100644 index 0000000..e8b49f9 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c @@ -0,0 +1,264 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
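+ * upsamplescale replicates each input pixel into a stride x stride block of the output and multiplies it by a constant scale factor.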
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.upsamplescale") + + +/* + * Kernel params + */ +static vx_param_description_t _upsamplescale_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _UPSAMPLESCALE_PARAM_NUM _cnt_of_array( _upsamplescale_kernel_param_def ) + +#define SCALAR_STRIDE_VALUE (2) +#define SCALAR_SCALE_VALUE (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t i = 0; + int32_t stride = 0; + float scale = 0.0f; + int32_t width = 0; + int32_t height = 0; + int32_t out_width = 0; + int32_t out_height = 0; + int32_t outerSize = 1; + int32_t x = 0; + int32_t y = 0; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_VALUE], &stride); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &scale); + + width = in_attr[0]->shape->data[0]; + height = in_attr[0]->shape->data[1]; + for (i = 2; i < (int32_t)in_attr[0]->shape->size; i++) + { + outerSize *= in_attr[0]->shape->data[i]; + } + + out_width = out_attr[0]->shape->data[0]; + out_height = out_attr[0]->shape->data[1]; + + for (i = 0; i < outerSize; i++) + { + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) + { + int32_t in_idx = i * width * height + y * width + x; + int32_t base_idx = i * out_width * out_height + + y * stride * out_width + x * stride; + int32_t dx = 0; + int32_t dy = 0; + float data = f32_in_buffer[0][in_idx] * scale; + + for (dy = 0; dy < stride; dy++) + { + for (dx = 0; dx < stride; dx++) + { + int32_t idx = base_idx + dy * out_width + dx; + + f32_out_buffer[0][idx] = data; + } + } + + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], 
out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _upsamplescale_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _upsamplescale_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_UPSAMPLESCALE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t stride = 0; + float scale = 1.0f; + + stride = vsi_nn_kernel_param_get_int32(params, "stride"); + scale = vsi_nn_kernel_param_get_float32(params, "scale"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLESCALE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + + node_params[SCALAR_STRIDE_VALUE] = vsi_nn_kernel_scalar_create( + graph, I32, &stride ); + node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( + graph, F32, &scale ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLESCALE_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( upsamplescale, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c index 4401d24..b634604 100644 --- a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c @@ -79,8 +79,10 @@ typedef struct static const _kernel_map_type _a_times_b_plus_c_kernel_map[] = { PACK_KERNEL_MAP(F16, F16, F16, F16), + PACK_KERNEL_MAP(F16, F16, F32, F16), PACK_KERNEL_MAP_2D(F16, F16, F16, F16), + PACK_KERNEL_MAP_2D(F16, F16, F32, F16), }; /* @@ -106,7 +108,7 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) ) { #define _PACK_A_TIMES_B_PLUS_C_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ - (( IN1_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) + (( IN2_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) vsi_status status = VX_SUCCESS; // Alignment with a power of two value. 
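+ /* The pack key now carries the C-operand dtype in its top byte (it previously packed IN1_TYPE twice), so the new F16*F16+F32 entries below can be selected. */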
gpu_param_t gpu_param = { @@ -183,6 +185,48 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case _PACK_A_TIMES_B_PLUS_C_KEY( F16, F16, F32, F16 ): + { + gpu_dp_inst_t uniA_Times_B_lo_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniA_Times_B_hi_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniA_Times_B_lo_4x4", &uniA_Times_B_lo_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniA_Times_B_hi_4x4", &uniA_Times_B_hi_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; default: break; } @@ -223,13 +267,13 @@ static vsi_status _query_kernel vx_param_description_t * param_def = _a_times_b_plus_c_kernel_param_def; size_t param_def_size = _cnt_of_array( _a_times_b_plus_c_kernel_param_def ); vx_kernel_initialize_f initializer = _a_times_b_plus_c_initializer; - uint32_t key; - uint32_t i; + uint32_t key = 0; + uint32_t i = 0; in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); - in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); key = A_TIMES_B_PLUS_C_HASH_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d); diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index 0c8273e..2ef5977 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -53,18 +53,34 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_GATHER_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16") #define VX_KERNEL_NAME_GATHER_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8") +#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_I8toF16_axis0") +#define 
VX_KERNEL_NAME_GATHER_AXIS0_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_I16toF16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_F16toI8_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_F16toI16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0") +#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0") + #define KERNEL_SOURCE_1 "gather" #define KERNEL_SOURCE_2 "gather_mix" // Add kernel hashtable here -#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _quant_type) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_quant_type)) +#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0)) #define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \ VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1), \ + VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -81,6 +97,16 @@ static const struct { TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2) TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_1) + TENSOR_GATHER_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_1) + TENSOR_GATHER_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_1) + TENSOR_GATHER_AXIS0_KERNELS(F16, I32, F16, KERNEL_SOURCE_1) + TENSOR_GATHER_AXIS0_KERNELS(I8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(I16, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I8, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) }; /* @@ -123,7 +149,7 @@ static vsi_status get_gather_tensor_reshape_size sizes[i] = 1; } - if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) + if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = elementCnt; sizes[1] = 1; @@ -131,7 +157,7 @@ static vsi_status get_gather_tensor_reshape_size } else { - if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; @@ -191,7 +217,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) src0Scale = attr[0]->asymm.scale; dstZP = attr[2]->asymm.zero_point; dstScale = attr[2]->asymm.scale; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[0]->dfp.fl > 0) { @@ -202,12 +228,12 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); } } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { src0Scale = 1; } - if( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[2]->dfp.fl > 0) { @@ -219,7 +245,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) } dstScale = 1.0f/dstScale; } - else if( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) { dstScale = 1; } @@ 
-232,7 +258,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) } shaderParam.global_scale[0] = 16; - if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_scale[0] = 8; } @@ -340,6 +366,214 @@ OnError: return status; } +DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t block_num = 0; + int32_t indices_num = 1; + uint32_t input_dims1 = 0; + vx_uint32 i = 0; + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input1_shape = NULL; + int32_t src0ZP = 0; + float src0Scale = 0; + int32_t dstZP = 0; + float dstScale = 0; + + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + dstZP = attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src0Scale = 1; + } + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + dstScale = 1.0f/dstScale; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + } + + input1_shape = attr[1]->shape; + input_dims1 = (uint32_t)input1_shape->size; + for (i = 0; i < input_dims1; i++) + { + indices_num *= input1_shape->data[i]; + } + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((indices_num + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = block_num; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE) \ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype); + + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 
0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniExtraCopyDpKeepinEvis_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, F16): + case _PACK_SELECT_KEY( I8, F16): + case _PACK_SELECT_KEY( I16, F16): + { + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8): + case _PACK_SELECT_KEY( F16, I8): + case _PACK_SELECT_KEY( F16, I16): + { + int32_t postShift0 = 0; + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + + multAndoutZP1[0] = (uint32_t)(M0); + multAndoutZP1[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift0 ); + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16): + case _PACK_SELECT_KEY( I8, I8): + case _PACK_SELECT_KEY( U8, U8): + case _PACK_SELECT_KEY( F16, F16): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtraCopyDpKeepinEvis_2x8", &uniExtraCopyDpKeepinEvis_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} + /* * Query kernel */ @@ -348,7 +582,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, - const vsi_nn_kernel_param_t * params + const vsi_nn_kernel_param_t * params, + int32_t axis ) { vsi_status status = VSI_FAILURE; @@ -360,21 +595,28 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( 
outputs[0]->attr.dtype.vx_type ); - key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 ); + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis ); for( i = 0; i < _cnt_of_array(gather_map); i ++ ) { - if( gather_map[i].key == key ) + if ( gather_map[i].key == key ) { break; } } - if( i < _cnt_of_array(gather_map) ) + if ( i < _cnt_of_array(gather_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name ); kernel->info.parameters = _gather_kernel_param_def; kernel->info.numParams = _cnt_of_array( _gather_kernel_param_def ); - kernel->info.initialize = _gather_initializer; + if (axis) + { + kernel->info.initialize = _gather_axis0_initializer; + } + else + { + kernel->info.initialize = _gather_initializer; + } vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", @@ -405,26 +647,39 @@ static vsi_nn_kernel_node_t _setup int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t axis0_flg = 0; - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); - if(status != VSI_SUCCESS) + if (axis == 0) + { + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0); + axis0_flg = 1; + } + else + { + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); + axis0_flg = 0; + } + if (status != VSI_SUCCESS) { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + status = _query_kernel( inputs, outputs, kernel, params, axis0_flg); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 0; #define RESHAPE_DIM 2 diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index b1f413c..b893e74 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -183,7 +183,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_int_array_t * input_shape = NULL; - float scaleIn = 0; + float scaleIn = 1; int32_t input_zp = 0; vx_uint32 iter = 0; int32_t sumInZp = 0; @@ -206,10 +206,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - if(attr[0]->dtype == I8 || attr[0]->dtype == I16) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = 
attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -225,13 +228,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) width = input_shape->data[0]; height = input_shape->data[1]; chn = attr[1]->shape->data[1]; - if(rsFlg) + if (rsFlg) { height = height / chn; } iter = height * 16; - if(attr[0]->dtype == U8) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { sumInZp = input_zp * iter * (-1); tmpZp1 = (-2) * input_zp; @@ -247,11 +250,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) shaderParam.local_size[1] = 1; shaderParam.local_size[2] = 1; - if(attr[0]->dtype == I8 || attr[0]->dtype == U8) + if (attr[0]->dtype == I8 || attr[0]->dtype == U8) { shaderParam.global_size[0] = (width + 255) / 256 * 16; } - else if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_size[0] = (width + 127) / 128 * 16; } @@ -261,7 +264,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); - if(attr[0]->dtype == U8) + if (attr[0]->dtype == U8) { gpu_dp_inst_t uniSumU8_16x1 = {{ 0x55555555, // TCfg @@ -290,7 +293,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if(attr[0]->dtype == I8) + else if (attr[0]->dtype == I8) { gpu_dp_inst_t uniSumInt8_16x1 = {{ 0x55555555, // TCfg @@ -317,7 +320,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if(attr[0]->dtype == I16) + else if (attr[0]->dtype == I16) { gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ 0x55555555, // TCfg @@ -333,7 +336,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if(attr[0]->dtype == F16) + else if (attr[0]->dtype == F16) { gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ 0x55555555, // TCfg @@ -384,10 +387,10 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_int_array_t * input_shape = NULL; - float scaleIn = 0; - float scaleOut = 0; - float reScaleOut_u8 = 0; - float scale_inOut = 0; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float reScaleOut_u8 = 1.0f; + float scale_inOut = 1.0f; int32_t output_zp = 0; int32_t input_zp = 0; float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1; @@ -407,12 +410,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - output_zp = attr[2]->asymm.zero_point; - scaleOut = attr[2]->asymm.scale; - if(attr[0]->dtype == I8 || attr[0]->dtype == I16) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -422,9 +426,16 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) { in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); } + input_zp = 0; } - if(attr[2]->dtype == I8 || attr[2]->dtype == I16) + if (attr[2]->quant == 
VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = attr[2]->asymm.zero_point; + scaleOut = attr[2]->asymm.scale; + reScaleOut_u8 = 1 / scaleOut; + } + else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[2]->dfp.fl > 0) { @@ -434,10 +445,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) { out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); } + output_zp = 0; } - if((attr[2]->dtype == I8 || attr[2]->dtype == I16) - && (attr[0]->dtype == I8 || attr[0]->dtype == I16)) + if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + && (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)) { inOut_fl_scale = in_scale_fl * out_scale_fl; } @@ -445,21 +457,17 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) width = input_shape->data[0]; height = input_shape->data[1]; chn = attr[1]->shape->data[1]; - if(rsFlg) + if (rsFlg) { height = height / chn; } - if(attr[2]->dtype == U8) - { - reScaleOut_u8 = 1 / scaleOut; - } dimRatio = (float)(1.0 / (width * height)); group_num = (width + 255) / 256; shaderParam.global_scale[0] = 16; - if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_scale[0] = 8; group_num = (width + 127) / 128; @@ -774,12 +782,12 @@ static vsi_status _query_kernel for( i = 0; i < kernel_map_size; i ++ ) { - if( kernel_map[i].key == hashkey ) + if ( kernel_map[i].key == hashkey ) { break; } } - if( i < kernel_map_size ) + if ( i < kernel_map_size ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = param_def; @@ -830,7 +838,7 @@ static vsi_nn_kernel_node_t _setup int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); // Check if gpu can support the size - if( !vsi_nn_kernel_gpu_check_shape( + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -850,7 +858,7 @@ static vsi_nn_kernel_node_t _setup attr.size[0] = ((inputs[0]->attr.size[0] + 255) / 256) * 4; - if( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) { attr.size[0] = ((inputs[0]->attr.size[0] + 127) / 128) * 4; @@ -868,17 +876,17 @@ static vsi_nn_kernel_node_t _setup hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } - if(reshape_flg) + if (reshape_flg) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[0]->attr.size[0]; @@ -893,7 +901,7 @@ static vsi_nn_kernel_node_t _setup shape[3] = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); } - if(inputs[1]->attr.dim_num < 2) + if (inputs[1]->attr.dim_num < 2) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[1]->attr.size[0]; @@ -902,7 +910,7 @@ static vsi_nn_kernel_node_t _setup shape[3] = 1; rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); } - if(inputs[2]->attr.dim_num < 2) + if (inputs[2]->attr.dim_num < 2) { int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[2]->attr.size[0]; @@ -914,10 +922,10 @@ static vsi_nn_kernel_node_t _setup // Mean Vari { tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); - if(tmp_node) + if (tmp_node) { uint32_t index = 0; - if(reshape_flg) + if (reshape_flg) { mean_vari_node_params[index++] = rs_input; vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index], @@ -943,7 +951,7 @@ static vsi_nn_kernel_node_t _setup border.mode = VX_BORDER_CONSTANT; border.constant_value.U8 = 0; border.constant_value.U16 = 0; - if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; } @@ -956,10 +964,10 @@ static vsi_nn_kernel_node_t _setup // Nomalization { node = vsi_nn_kernel_create_node( graph, kernel ); - if(node) + if (node) { uint32_t index = 0; - if(reshape_flg) + if (reshape_flg) { node_params[index++] = rs_input; } @@ -967,7 +975,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; } - if(inputs[1]->attr.dim_num < 2) + if (inputs[1]->attr.dim_num < 2) { node_params[index++] = rs_beta; } @@ -975,7 +983,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; } - if(inputs[2]->attr.dim_num < 2) + if (inputs[2]->attr.dim_num < 2) { node_params[index++] = rs_gamma; } @@ -984,7 +992,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; } node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - if(reshape_flg) + if (reshape_flg) { node_params[index++] = rs_output; } @@ -1006,9 +1014,9 @@ static vsi_nn_kernel_node_t _setup border.mode = VX_BORDER_CONSTANT; border.constant_value.U8 = 0; border.constant_value.U16 = 0; - if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)outputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1018,31 +1026,31 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. 
*/ final: - if(rs_beta) + if (rs_beta) { vsi_nn_kernel_tensor_release( &rs_beta ); } - if(rs_gamma) + if (rs_gamma) { vsi_nn_kernel_tensor_release( &rs_gamma ); } - if(reshape_flg) + if (reshape_flg) { vsi_nn_kernel_tensor_release( &rs_input ); vsi_nn_kernel_tensor_release( &rs_output ); } for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { - if( ikernels[i] ) + if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if( tensors[i] ) + if ( tensors[i] ) { vsi_nn_ReleaseTensor( &tensors[i] ); } } - if(tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c new file mode 100644 index 0000000..238eb23 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -0,0 +1,1389 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ + typedef enum +{ + LAYERNORM_KERNEL, + LAYERNORM_2D_KERNEL, + SUMSQR_KERNEL, + SUMSQR_2D_KERNEL, + LAYERNORM_WH_KERNEL, + LAYERNORM_WH_2D_KERNEL, +} _kernel_type_e; + +#define KERNEL_SOURCE_1 "layer_normalization" +#define KERNEL_SOURCE_2 "layer_normalization_2d" +#define KERNEL_SOURCE_3 "layer_normalization_u8_f16" +#define KERNEL_SOURCE_4 "layer_normalization_wh_u8" +#define KERNEL_SOURCE_5 "layer_normalization_wh_f16" +#define KERNEL_SOURCE_6 "layer_normalization_i16" +#define KERNEL_SOURCE_7 "layer_normalization_wh_i16" + + +#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +// normalization +#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_KERNEL), \ + HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_2D_KERNEL), \ + HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +// greater than max size +#define HASH_SUMSQR_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_SUMSQR_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define HASH_LAYERNORM_WH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_SUMSQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_KERNEL), \ + HASH_SUMSQR_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SUMSQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_2D_KERNEL), \ + HASH_SUMSQR_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_LAYERNORM_WH_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_KERNEL), \ + HASH_LAYERNORM_WH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_LAYERNORM_WH_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \ + HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _layernorm_kernel_map[] = +{ + // Register kernel here + TENSOR_LAYERNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) + TENSOR_LAYERNORM_KERNELS( U8, F16, KERNEL_SOURCE_3 ) + TENSOR_LAYERNORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_3 ) + + TENSOR_LAYERNORM_KERNELS( F16, F16, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 ) + TENSOR_LAYERNORM_KERNELS( F16, U8, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_2 ) + TENSOR_LAYERNORM_KERNELS( I16, I16, KERNEL_SOURCE_6 ) + TENSOR_LAYERNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 ) +}; + +static const _kernel_map_type 
_sumsqr_kernel_map[] = +{ + // Register kernel here + TENSOR_SUMSQR_KERNELS( U8, F32, KERNEL_SOURCE_4 ) + TENSOR_SUMSQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_4 ) + TENSOR_SUMSQR_KERNELS( F16, F32, KERNEL_SOURCE_5 ) + TENSOR_SUMSQR_KERNELS_2D( F16, F32, KERNEL_SOURCE_5 ) + TENSOR_SUMSQR_KERNELS( I16, F32, KERNEL_SOURCE_7 ) + TENSOR_SUMSQR_KERNELS_2D( I16, F32, KERNEL_SOURCE_7 ) + + TENSOR_LAYERNORM_WH_KERNELS( U8, U8, KERNEL_SOURCE_4 ) + TENSOR_LAYERNORM_WH_KERNELS_2D( U8, U8, KERNEL_SOURCE_4 ) + TENSOR_LAYERNORM_WH_KERNELS( U8, F16, KERNEL_SOURCE_4 ) + TENSOR_LAYERNORM_WH_KERNELS_2D( U8, F16, KERNEL_SOURCE_4 ) + TENSOR_LAYERNORM_WH_KERNELS( F16, F16, KERNEL_SOURCE_5 ) + TENSOR_LAYERNORM_WH_KERNELS_2D( F16, F16, KERNEL_SOURCE_5 ) + TENSOR_LAYERNORM_WH_KERNELS( I16, I16, KERNEL_SOURCE_7 ) + TENSOR_LAYERNORM_WH_KERNELS_2D( I16, I16, KERNEL_SOURCE_7 ) +}; + +/* + * Kernel params + */ + +static vx_param_description_t _layernorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _sumSqr_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _layernorm_wh_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +#define _LAYERNORM_PARAM_NUM _cnt_of_array( _layernorm_kernel_param_def ) +#define _SUMSQR_PARAM_NUM _cnt_of_array( _sumSqr_kernel_param_def ) +#define _LAYERNORM_WH_PARAM_NUM _cnt_of_array( _layernorm_wh_kernel_param_def ) + +/* + * Kernel initializer + */ + +DEF_KERNEL_INITIALIZER(_layernorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1; + float scaleOut = 1; + float output_zp = 0; + int32_t input_zp = 0; + int32_t iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + int32_t tmpZp2 = 0; + float e2InScale = 0; + int32_t height = 0, width = 0, chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr 
buffer fail.", OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + scaleIn = 1; + input_zp = 0; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = (float)attr[2]->asymm.zero_point; + scaleOut = 1.0f / attr[2]->asymm.scale; + } + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[2]->dfp.fl > 0) + { + scaleOut = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + scaleOut = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + output_zp = 0; + } + else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + scaleOut = 1; + output_zp = 0.0f; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = (input_shape->size <= 2) ? 1 : input_shape->data[2]; + + iter = ((width + 15) / 16) * 16; + sumInZp = input_zp * iter * (-1); + tmpZp1 = (-2) * input_zp; + tmpZp2 = iter * input_zp * input_zp; + e2InScale = scaleIn * scaleIn; + + shaderParam.global_scale[0] = width; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = 1; + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 4); + shaderParam.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + float dimRatio = 1.0f / (float)width; + float dimRatio_scale = dimRatio * scaleIn; + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSumU8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 
0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSum_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t UniPackFP16even_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t pack_key = 0; +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (IN1_TYPE << 16) | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype ); + + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= 
vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, F16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "UniPackFP16even_2x8", + &UniPackFP16even_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( F16, F16, F16 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", + &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4", + &uniExtractHalf4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, F16, I16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", + &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + + status |= 
vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } +#undef _PACK_SELECT_KEY + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_sumsqr_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1.0f; + int32_t input_zp = 0; + vx_uint32 iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + float tmpZp2 = 0; + float e2InScale = 0; + float rowSumScale = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + iter = height * 16; + + e2InScale = scaleIn * scaleIn; + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + sumInZp = input_zp * iter * (-1); + tmpZp1 = (-2) * input_zp; + tmpZp2 = input_zp * input_zp * e2InScale; + rowSumScale = height * 16 * tmpZp2; + } + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.local_size[0] = 16; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + + if (attr[0]->dtype == I8 || attr[0]->dtype == U8) + { + shaderParam.global_size[0] = (width + 255) / 256 * 16; + } + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_size[0] = (width + 127) / 128 * 16; + } + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + if (attr[0]->dtype == U8) + { + gpu_dp_inst_t uniSumU8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, 
// BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSum_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == F16) + { + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == I16) + { + gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float output_zp = 0; + int32_t input_zp = 0; + float dimRatio = 0; + vx_uint32 group_num = 0; + vx_int32 
height = 0, width = 0, chn = 0, height_chn_org = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = (float)attr[2]->asymm.zero_point; + scaleOut = 1.0f / attr[2]->asymm.scale; + } + else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[2]->dfp.fl > 0) + { + scaleOut = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + scaleOut = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + output_zp = 0; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + height_chn_org = (input_shape->size > 2 ? input_shape->data[2] : 1) / chn; + + dimRatio = (float)(1.0 / (width * height)); + + group_num = (width + 255) / 256; + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + group_num = (width + 127) / 128; + } + + shaderParam.global_scale[0] = 8; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (chn + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t pack_key = 0; +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); + + status = vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "height_depth", &height_chn_org); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( U8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16 ): + case _PACK_SELECT_KEY( F16, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } +#undef _PACK_SELECT_KEY + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); 
+ attr[2] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t reshape2D + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + _kernel_type_e kernel_type = LAYERNORM_KERNEL; + + if (reshape2D) + { + kernel_type = LAYERNORM_2D_KERNEL; + } + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, kernel_type ); + + for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ ) + { + if ( _layernorm_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_layernorm_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_kernel_map[i].function_name ); + kernel->info.parameters = _layernorm_kernel_param_def; + kernel->info.numParams = _LAYERNORM_PARAM_NUM; + kernel->info.initialize = _layernorm_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _layernorm_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _layernorm_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_status _query_kernel_wh + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_sumSqr, + vsi_nn_kernel_t* kernel, + _kernel_type_e is2D_sumsqr, + _kernel_type_e is2D_wh + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_LAYERNORM_KEY( input0_dtype, F32, is2D_sumsqr ); + + for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) + { + if ( _sumsqr_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_sumsqr_kernel_map) ) + { + snprintf( kernel_sumSqr->info.name, VX_MAX_KERNEL_NAME, "%s", _sumsqr_kernel_map[i].function_name ); + kernel_sumSqr->info.parameters = _sumSqr_kernel_param_def; + kernel_sumSqr->info.numParams = _SUMSQR_PARAM_NUM; + kernel_sumSqr->info.initialize = _sumsqr_initializer; + + vsi_nn_kernel_add_source( kernel_sumSqr, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _sumsqr_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_sumSqr, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _sumsqr_kernel_map[i].source_name ); + } + + + key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, is2D_wh ); + + for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) + { + if ( _sumsqr_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_sumsqr_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _sumsqr_kernel_map[i].function_name ); + kernel->info.parameters = _layernorm_wh_kernel_param_def; + kernel->info.numParams = _LAYERNORM_WH_PARAM_NUM; + kernel->info.initialize = _layernorm_wh_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _sumsqr_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, 
VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _sumsqr_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel_wh() */ + +static vsi_nn_kernel_node_t _setup_wh + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; + vsi_nn_kernel_node_param_t sumSqr_node_params[_SUMSQR_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_LAYERNORM_WH_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t tmp_node = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_attr_t attr; + _kernel_type_e is2D_sumsqr = SUMSQR_2D_KERNEL; + _kernel_type_e is2D_wh = LAYERNORM_WH_2D_KERNEL; + vsi_nn_kernel_t * kernel_sumSqr = NULL; + vsi_nn_tensor_t * tensor_sumSqr = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + int32_t axis[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t axis_num = 1; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; + uint32_t axis_size = 0; + uint32_t rank_in = 0, rank_para = 0; + uint32_t outer_size = 1; + uint32_t i = 0; + + for(i = 1; i < inputs[0]->attr.dim_num; i++) + { + outer_size *= inputs[0]->attr.size[i]; + } + + status = vsi_nn_kernel_optimize_tensor_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size); + if ( status == FALSE || axis_size > 2) + { + return NULL; + } + + status = vsi_nn_kernel_optimize_tensor_shape( + (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, + axis, axis_num, new_shape[1], &rank_para, new_axis, &axis_size); + if ( status == FALSE || axis_size > 2) + { + return NULL; + } + + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], rank_in); + + rs_beta = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[1], rank_para); + + rs_gamma = vsi_nn_kernel_tensor_reshape(inputs[2]->t, new_shape[1], rank_para); + + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[0], rank_in); + + if (rank_in > 2) + { + is2D_sumsqr = SUMSQR_KERNEL; + is2D_wh = LAYERNORM_WH_KERNEL; + } + + kernel_sumSqr = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + kernel_sumSqr->unique_id = kernel->unique_id; + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = ((new_shape[0][0] + 255) / 256) * 4; + + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + { + attr.size[0] = ((new_shape[0][0] + 127) / 128) * 4; + } + attr.size[1] = outer_size; + attr.size[2] = 1; + attr.size[3] = 1; + attr.dim_num = 4; + tensor_sumSqr = vsi_nn_CreateTensor( graph, &attr ); + + status = _query_kernel_wh(inputs, outputs, kernel_sumSqr, kernel, is2D_sumsqr, is2D_wh); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + { + tmp_node = vsi_nn_kernel_create_node( graph, kernel_sumSqr ); + if (tmp_node) + { + sumSqr_node_params[0] = rs_input; + sumSqr_node_params[1] = (vsi_nn_kernel_node_param_t)tensor_sumSqr->t; + + status = vsi_nn_kernel_node_pass_param( tmp_node, sumSqr_node_params, + _SUMSQR_PARAM_NUM ); + CHECK_STATUS(status); + { + // Set default border mode. 
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + } + + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + node_params[index++] = rs_input; + node_params[index++] = rs_beta; + node_params[index++] = rs_gamma; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_sumSqr->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _LAYERNORM_WH_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[5] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + } + +final: + if (rs_beta) + { + vsi_nn_kernel_tensor_release( &rs_beta ); + } + if (rs_gamma) + { + vsi_nn_kernel_tensor_release( &rs_gamma ); + } + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + if ( kernel_sumSqr ) + { + vsi_nn_kernel_release( &kernel_sumSqr ); + } + if ( tensor_sumSqr ) + { + vsi_nn_ReleaseTensor( &tensor_sumSqr ); + } + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + + return node; +} + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; + int32_t rs_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); + int32_t wh_flg = vsi_nn_kernel_param_get_int32( params, "wh_flg" ); + int32_t optFlg = rs_flg || (outputs[0]->attr.dim_num < 3); + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + if (wh_flg) + { + node = _setup_wh(graph, inputs, input_num, outputs, output_num, params, kernel); + goto final; + } + + status = _query_kernel( inputs, outputs, kernel, optFlg); + if (VSI_SUCCESS != status) + { + goto final; + } + + if (rs_flg) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[0]->attr.size[0]; + shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + + shape[0] = outputs[0]->attr.size[0]; + shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + } + if (inputs[1]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[1]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); + } + if (inputs[2]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[2]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); + } + + // Nomalization + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + if (rs_flg) + { + node_params[index++] = rs_input; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + } + if (inputs[1]->attr.dim_num < 2) + { + node_params[index++] = rs_beta; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + if (inputs[2]->attr.dim_num < 2) + { + node_params[index++] = rs_gamma; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + } + if (rs_flg) + { + node_params[index++] = rs_output; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + } + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _LAYERNORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[4] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + /* Pass parameters to node. 
*/ +final: + if (rs_beta) + { + vsi_nn_kernel_tensor_release( &rs_beta ); + } + if (rs_gamma) + { + vsi_nn_kernel_tensor_release( &rs_gamma ); + } + if (rs_flg) + { + vsi_nn_kernel_tensor_release( &rs_input ); + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( layer_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c index e46ea14..42ff180 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -68,7 +68,6 @@ static const struct { { TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_2) }; static vx_param_description_t vxPreProcessBgraKernel_param_def[] = @@ -106,7 +105,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) int32_t dstZP = 0; float outputScale = 1; int32_t reorder = 0; - int32_t trans = 0; int32_t xRatio = 0; int32_t yRatio = 0; int32_t order1 = 2; @@ -126,8 +124,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; @@ -135,19 +131,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(trans) - { - width = width / 3; - } - - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -159,11 +150,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) } dstZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { outputScale = 1.0f/outputScale; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { outputScale = 1; dstZP = 0; @@ -286,16 +277,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniExtractInt32BgraToU8Bgr_2x8 = {{ - 0x00333333, // TCfg - 0x00111000, // ASelt - 0x00020100, 0x00000201, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - // copy gpu_dp_inst_t uniExtractBfromBgra_4x4 = {{ 0x01010101, // TCfg @@ -355,23 +336,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16 }; - if(trans) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtractInt32BgraToU8Bgr_2x8", - &uniExtractInt32BgraToU8Bgr_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp1BgraShort_4x4", &uniBilinearTmp1BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, 
"uniBilinearTmp2BgraShort_4x4", &uniBilinearTmp2BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp3BgraShort_4x4", &uniBilinearTmp3BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp4BgraShort_4x4", &uniBilinearTmp4BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp5BgraShort_4x4", &uniBilinearTmp5BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp6BgraShort_4x4", &uniBilinearTmp6BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp7BgraShort_4x4", &uniBilinearTmp7BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp8BgraShort_4x4", &uniBilinearTmp8BgraShort_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniDescaleU8_4x4", &uniDescaleU8_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - else if(enable_copy) + if (enable_copy) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtractBfromBgra_4x4", &uniExtractBfromBgra_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGfromBgra_4x4", &uniExtractGfromBgra_4x4); @@ -429,16 +394,11 @@ static vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_perm) - { - convert_type = SCALE_NHWC; - } - else if(enable_copy) + if (enable_copy) { convert_type = COPY; } @@ -449,14 +409,14 @@ static vsi_status _query_kernel key = HASH_PRE_PROCESS_BGRA_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_bgra_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_bgra_map); i ++ ) { if( pre_process_bgra_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_bgra_map) ) + if ( i < _cnt_of_array(pre_process_bgra_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_bgra_map[i].function_name ); kernel->info.parameters = vxPreProcessBgraKernel_param_def; @@ -488,19 +448,19 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 2; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index 6976058..8ce0467 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -43,7 +43,6 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8 
CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toU8") -#define VX_KERNEL_NAME_PRE_PROCESS_NV12_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_trans_U8toU8") // greater than a quarter #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq") @@ -51,7 +50,6 @@ __BEGIN_DECLS #define KERNEL_SOURCE_1 "pre_process_nv12_scale_8bits", #define KERNEL_SOURCE_2 "pre_process_nv12_scale", -#define KERNEL_SOURCE_3 "pre_process_nv12_trans_u8", #define KERNEL_SOURCE_4 "pre_process_nv12_scale_mix" typedef enum @@ -85,7 +83,6 @@ static const struct { TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_3) TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_4) TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_4) }; @@ -156,17 +153,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { dstScale = 1.0f / dstScale; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -178,7 +175,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) } dstZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) { dstScale = 1; dstZP = 0; @@ -295,7 +292,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) int32_t dstZP = 0; float dstScale = 1; int32_t reorder = 0; - int32_t trans = 0; int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; @@ -325,8 +321,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[1]->shape; dstZP = attr[1]->asymm.zero_point; @@ -334,24 +328,21 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) - { - width = width / 3; - } + resize = (float)width / attr[0]->shape->data[0]; xrIntFloat_16 = (attr[0]->shape->data[0] << 16) / width + 1; yrIntFloat_16 = (attr[0]->shape->data[1] << 16) / height + 1; - if(attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { dstScale = 1.0f / dstScale; } - else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[1]->dfp.fl > 0) { @@ -363,7 +354,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) } dstZP = 0; } - else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) { dstScale = 1; dstZP = 0; @@ -450,27 +441,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) }, GPU_DP_TYPE_16 }; //trans - gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{ - 0x11311311, // TCfg - 
0x00100100, // ASelt - 0x01000400, 0x06020105, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{ - 0x00003113, // TCfg - 0x00001001, // ASelt - 0x03070302, 0x00000000, // ABin - 0x00000220, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000001, 0x00000001, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniCalculateYShift_2x8 = {{ 0x00009999, // TCfg 0x00000000, // ASelt @@ -502,23 +472,15 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); - if(resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16) && !trans) + + if (resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16)) { status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8); } CHECK_STATUS_FAIL_GOTO(status, OnError ); - if(trans && attr[1]->dtype == U8) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else - { - status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); - } + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -572,20 +534,15 @@ static vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); uint32_t srcWidth = inputs[0]->attr.size[0]; - uint32_t dstWidth = enable_perm ? 
outputs[0]->attr.size[1] : outputs[0]->attr.size[0]; + uint32_t dstWidth = outputs[0]->attr.size[0]; float scaleVal = (float)dstWidth / srcWidth; uint32_t optFlg = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_perm) - { - convert_type = TRANS; - } - else if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { convert_type = COPY; } @@ -594,7 +551,7 @@ static vsi_status _query_kernel convert_type = SCALE; } - if(scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE) + if (scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE) { optFlg = 1; } @@ -608,7 +565,7 @@ static vsi_status _query_kernel break; } } - if( i < _cnt_of_array(pre_process_nv12_map) ) + if ( i < _cnt_of_array(pre_process_nv12_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_map[i].function_name ); kernel->info.parameters = vxPreProcessNv12Kernel_param_def; @@ -646,21 +603,20 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 3; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); @@ -674,22 +630,9 @@ static vsi_nn_kernel_node_t _setup int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. 
*/ - if(trans) - { - shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; - shapes[1] = outputs[0]->attr.size[2]; - - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); - - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM, - inputs, 2, &reshape_tensors[0], 1 ); - } - else - { - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM, + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM, inputs, 2, outputs, 1 ); - } + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index c5ea1c5..09f55a6 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -90,14 +90,6 @@ static const struct { TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, SCALE_NHWC, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, SCALE_NHWC, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, SCALE_NHWC, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, COPY_NHWC, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY_NHWC, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY_NHWC, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY_NHWC, KERNEL_SOURCE_4) }; static vx_param_description_t vxPreProcessRgbKernel_param_def[] = @@ -156,8 +148,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; outputZP = (float)attr[0]->asymm.zero_point; @@ -165,14 +155,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -184,11 +174,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) } outputZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { outputScale = 1.0f / outputScale; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { outputScale = 1; outputZP = 0; @@ -199,48 +189,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) pack_key = _PACK_SELECT_KEY( enable_copy, reorder, trans); { - // trans and copy - gpu_dp_inst_t uniNormilizationLo_2x8 = {{ - 0x99999999, // TCfg - 0x44444444, // ASelt - 0x45002142, 0x27480324, // ABin - 0x99999999, // BSelt - 0x06060606, 0x06060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniNormilizationHi_2x8 = {{ - 0x09999999, // TCfg - 0x04444444, // ASelt - 0x092a4b06, 0x000c2d4e, // ABin - 0x09999999, // BSelt - 0x06060606, 0x00060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniNormilizationLo_NHWC_2x8 = {{ - 0x99999999, // TCfg - 0x44444444, // ASelt - 0x03422100, 0x27064524, // ABin - 0x99999999, // BSelt - 0x06060606, 0x06060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniNormilizationHi_NHWC_2x8 = {{ - 0x09999999, // TCfg - 0x04444444, // ASelt - 0x4b2a0948, 0x004e2d0c, // ABin - 0x09999999, // BSelt - 0x06060606, 0x00060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - // copy gpu_dp_inst_t uniExtractRtoF32_part0_4x4 = {{ 0x01010101, // TCfg @@ -404,79 +352,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniRePackRGBLo_2x8 = {{ - 0x00111111, // TCfg - 0x00001001, // ASelt - 0x01000400, 0x00000105, // ABin - 0x00222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniRePackRGBHi_2x8 = {{ - 0x00111111, // TCfg - 0x00001001, // ASelt - 0x03020602, 0x00000307, // ABin - 0x00222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniRePackRGBLo_NHWC_2x8 = {{ - 0x00111111, // TCfg - 0x00100100, // ASelt - 0x01000400, 0x00000105, // ABin - 0x00222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniRePackRGBHi_NHWC_2x8 = {{ - 0x00111111, // TCfg - 0x00100100, // ASelt - 0x03020602, 0x00000307, // ABin - 0x00222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - switch( pack_key ) + switch ( pack_key ) { - case _PACK_SELECT_KEY( 1, 0, 1): // copy trans - { - shaderParam.global_scale[0] = 15; - shaderParam.global_scale[1] = 1; - shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); - shaderParam.global_size[1] = height; - shaderParam.global_size[2] = 1; - - status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_NHWC_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, 
"uniNormilizationHi_2x8", &uniNormilizationHi_NHWC_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - break; - case _PACK_SELECT_KEY( 1, 2, 1): // copy reorder trans - { - shaderParam.global_scale[0] = 15; - shaderParam.global_scale[1] = 1; - shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); - shaderParam.global_size[1] = height; - shaderParam.global_size[2] = 1; - - status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniNormilizationHi_2x8", &uniNormilizationHi_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - break; case _PACK_SELECT_KEY( 1, 0, 0): // copy case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder { @@ -539,68 +417,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); } break; - case _PACK_SELECT_KEY( 0, 0, 1): // trans - { - shaderParam.global_scale[0] = 4; - shaderParam.global_scale[1] = 1; - shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); - shaderParam.global_size[1] = height; - shaderParam.global_size[2] = 1; - - if(attr[0]->dtype == F16) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); - } - else - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); - } - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB); - status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); - status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); - status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); - status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); - status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_NHWC_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_NHWC_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - break; - case _PACK_SELECT_KEY( 0, 2, 1): // reorder trans - { - shaderParam.global_scale[0] = 4; - shaderParam.global_scale[1] = 1; - shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); - shaderParam.global_size[1] = height; - shaderParam.global_size[2] = 1; - - if(attr[0]->dtype == F16) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); - } - else - { - status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); - } - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG); - status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB); - status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); - status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); - status |= 
vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); - status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); - status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - break; default: break; } @@ -637,23 +453,14 @@ static vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_copy && enable_perm) - { - convert_type = COPY_NHWC; - } - else if(enable_copy) + if (enable_copy) { convert_type = COPY; } - else if(enable_perm) - { - convert_type = SCALE_NHWC; - } else { convert_type = SCALE; @@ -661,14 +468,14 @@ static vsi_status _query_kernel key = HASH_PRE_PROCESS_RGB_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_rgb_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_rgb_map); i ++ ) { if( pre_process_rgb_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_rgb_map) ) + if ( i < _cnt_of_array(pre_process_rgb_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_rgb_map[i].function_name ); kernel->info.parameters = vxPreProcessRgbKernel_param_def; @@ -698,21 +505,20 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_RGB_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 2; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); @@ -726,18 +532,7 @@ static vsi_nn_kernel_node_t _setup int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. 
*/ - if(trans) - { - shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; - shapes[1] = outputs[0]->attr.size[2]; - - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); - - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM, - inputs, 1, &reshape_tensors[0], 1 ); - } - else + if (trans == 0) { vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM, inputs, 1, outputs, 1 ); @@ -767,7 +562,7 @@ static vsi_nn_kernel_node_t _setup } } - if(reshape_tensors[0]) + if (reshape_tensors[0]) { vsi_nn_ReleaseTensor(&reshape_tensors[0]); } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index b7617ae..2d32371 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -43,15 +43,12 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8") -#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_trans_U8") -#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_trans_U8toU8") #define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8", #define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8", #define KERNEL_SOURCE_3 "pre_process_yuv420_scale_fp16", #define KERNEL_SOURCE_4 "pre_process_yuv420_scale_i16", #define KERNEL_SOURCE_5 "pre_process_yuv420_scale_i8", -#define KERNEL_SOURCE_6 "pre_process_yuv420_trans_u8" typedef enum { @@ -80,8 +77,6 @@ static const struct { TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_6) }; static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] = @@ -143,24 +138,24 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) + if (trans) { width = width / 3; } - if(attr[0]->dtype == U8) + if (attr[0]->dtype == U8) { dstScale = 1.0f / dstScale; } shaderParam.global_scale[0] = 16; - if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_scale[0] = 8; } @@ -176,131 +171,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t uniPackBG0_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x01000000, 0x02020001, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackTmpAndR_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03000100, 0x07060104, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 
0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackRB0_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x03000302, 0x05040004, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03030100, 0x07060404, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackGR1_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x06000505, 0x07070006, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03060100, 0x07060704, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackBG1_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x09000808, 0x0a0a0009, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03080100, 0x07060904, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackRB2_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x0b000b0a, 0x0d0c000c, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x030b0100, 0x07060c04, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackGR2_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x0e000d0d, 0x0f0f000e, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // 
ASelt - 0x030e0100, 0x07060f04, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{ 0x05050505, // TCfg 0x04040404, // ASelt @@ -574,19 +444,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB0_2x8", &uniPackRB0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8); @@ -633,7 +490,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) int32_t dstZP = 0; float dstScale = 1; int32_t reorder = 0; - int32_t trans = 0; int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; @@ -646,8 +502,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; @@ -655,17 +509,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) - { - width = width / 3; - } - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -677,11 +527,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) } dstZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f/dstScale; + dstScale = 1.0f / dstScale; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { dstScale = 1; dstZP = 0; @@ -925,26 +775,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, 
GPU_DP_TYPE_16 }; - //trans - gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{ - 0x11311311, // TCfg - 0x00100100, // ASelt - 0x01000400, 0x06020105, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{ - 0x00003113, // TCfg - 0x00001001, // ASelt - 0x03070302, 0x00000000, // ABin - 0x00000220, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8); @@ -975,16 +805,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4); - if(trans) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else - { - status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); - } + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -1041,20 +862,11 @@ static vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_perm && enable_copy) - { - convert_type = COPY_TRANS; - } - else if(enable_perm) - { - convert_type = TRANS; - } - else if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { convert_type = COPY; } @@ -1065,20 +877,20 @@ static vsi_status _query_kernel key = HASH_PRE_PROCESS_YUV420_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_yuv420_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_yuv420_map); i ++ ) { - if( pre_process_yuv420_map[i].key == key ) + if ( pre_process_yuv420_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_yuv420_map) ) + if ( i < _cnt_of_array(pre_process_yuv420_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv420_map[i].function_name ); kernel->info.parameters = vxPreProcessYuv420Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def ); - if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { kernel->info.initialize = _pre_process_yuv420_copy_initializer; } @@ -1110,21 +922,20 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV420_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t 
shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 4; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); @@ -1138,22 +949,10 @@ static vsi_nn_kernel_node_t _setup int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. */ - if(trans) - { - shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; - shapes[1] = outputs[0]->attr.size[2]; - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM, + inputs, 3, outputs, 1 ); - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM, - inputs, 3, &reshape_tensors[0], 1 ); - } - else - { - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM, - inputs, 3, outputs, 1 ); - } tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); @@ -1178,7 +977,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[13] ); } } - if(reshape_tensors[0]) + if (reshape_tensors[0]) { vsi_nn_ReleaseTensor(&reshape_tensors[0]); } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index adb16cb..7d51d43 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -43,11 +43,8 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toU8") -#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_trans_U8") -#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_trans_U8toU8") #define KERNEL_SOURCE_1 "pre_process_yuv444_scale", -#define KERNEL_SOURCE_2 "pre_process_yuv444_trans_u8", #define KERNEL_SOURCE_3 "pre_process_yuv444_scale_fp16", #define KERNEL_SOURCE_4 "pre_process_yuv444_copy_u8", @@ -78,8 +75,6 @@ static const struct { TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_2) }; static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] = @@ -119,7 +114,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) int32_t dstZP = 0; float dstScale = 1; int32_t reorder = 
0; - int32_t trans = 0; int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; @@ -132,8 +126,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; @@ -141,24 +133,19 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) - { - width = width / 3; - } - - if(attr[0]->dtype == U8) + if (attr[0]->dtype == U8) { dstScale = 1.0f / dstScale; } shaderParam.global_scale[0] = 16; - if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { shaderParam.global_scale[0] = 8; } @@ -174,131 +161,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t uniPackBG0_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x01000000, 0x02020001, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackTmpAndR_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03000100, 0x07060104, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackRB0_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x03000302, 0x05040004, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03030100, 0x07060404, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackGR1_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x06000505, 0x07070006, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03060100, 0x07060704, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackBG1_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x09000808, 0x0a0a0009, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 
0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x03080100, 0x07060904, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniPackRB2_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x0b000b0a, 0x0d0c000c, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x030b0100, 0x07060c04, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackGR2_2x8 = {{ - 0x11011011, // TCfg - 0x10010010, // ASelt - 0x0e000d0d, 0x0f0f000e, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, - 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{ - 0x11111111, // TCfg - 0x00100100, // ASelt - 0x030e0100, 0x07060f04, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{ 0x05050505, // TCfg 0x04040404, // ASelt @@ -563,19 +425,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB0_2x8", &uniPackRB0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8); 
status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8); @@ -622,7 +471,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) int32_t dstZP = 0; float dstScale = 1; int32_t reorder = 0; - int32_t trans = 0; int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; @@ -635,8 +483,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); - CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; @@ -644,17 +490,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) width = out_shape->data[0]; height = out_shape->data[1]; - if(reorder != 0) + if (reorder != 0) { reorder = 2; order1 = 0; } - if(trans) - { - width = width / 3; - } - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { if (attr[0]->dfp.fl > 0) { @@ -666,11 +508,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) } dstZP = 0; } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f/dstScale; + dstScale = 1.0f / dstScale; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { dstScale = 1; dstZP = 0; @@ -914,26 +756,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16 }; - //trans - gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{ - 0x11311311, // TCfg - 0x00100100, // ASelt - 0x01000400, 0x06020105, // ABin - 0x22022022, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{ - 0x00003113, // TCfg - 0x00001001, // ASelt - 0x03070302, 0x00000000, // ABin - 0x00000220, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8); @@ -963,17 +785,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); - if(trans) - { - status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else - { - status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); - } status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -1024,20 +837,11 @@ static 
vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_perm && enable_copy) - { - convert_type = COPY_TRANS; - } - else if(enable_perm) - { - convert_type = TRANS; - } - else if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { convert_type = COPY; } @@ -1048,20 +852,20 @@ static vsi_status _query_kernel key = HASH_PRE_PROCESS_YUV444_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_yuv444_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_yuv444_map); i ++ ) { - if( pre_process_yuv444_map[i].key == key ) + if ( pre_process_yuv444_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_yuv444_map) ) + if ( i < _cnt_of_array(pre_process_yuv444_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv444_map[i].function_name ); kernel->info.parameters = vxPreProcessYuv444Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv444Kernel_param_def ); - if(enable_copy && output_dtype == U8) + if (enable_copy && output_dtype == U8) { kernel->info.initialize = _pre_process_yuv444_copy_initializer; } @@ -1093,21 +897,20 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV444_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t trans = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 4; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); @@ -1121,22 +924,9 @@ static vsi_nn_kernel_node_t _setup int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. 
*/ - if(trans) - { - shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; - shapes[1] = outputs[0]->attr.size[2]; + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM, + inputs, 3, outputs, 1 ); - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); - - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM, - inputs, 3, &reshape_tensors[0], 1 ); - } - else - { - vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM, - inputs, 3, outputs, 1 ); - } tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index 0cc7c61..af3e06f 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -369,6 +369,26 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniRightSubLeft_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; if (I8 == input_dtype && I8 == output_dtype && out_width > in_width) { @@ -405,7 +425,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4", + status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniConvertDFP2FP32_part1_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -447,16 +468,22 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4", + status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniConvertDFP2FP32_part1_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); CHECK_STATUS_FAIL_GOTO(status, final ); gpu_param.global_scale[2] = depth; } + else + { + status = vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= 
vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniDFPtoFp32_left_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } - status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_4x4", &uniConvertDFP2FP32_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -485,10 +512,33 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_left_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00020000, 0x00060004, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + if (F16 == output_dtype) { - status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale); + status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4); CHECK_STATUS_FAIL_GOTO(status, final ); } else @@ -544,13 +594,21 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) } else { + status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_4x4); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4); + "uniU8RightSubLeft_4x4", &uniU8SubZPtoFp32_part1_4x4); } CHECK_STATUS_FAIL_GOTO(status, final ); gpu_param.global_scale[2] = depth; } + else if (!is_use_scale_kernel) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + if (!is_use_scale_kernel) { status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); @@ -562,8 +620,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); if (!is_use_scale_kernel) { - status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_4x4", &uniU8SubZPtoFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP); + status = vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP); } CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -581,25 +638,25 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniFp16toFp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 
0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; gpu_dp_inst_t uniRightSubLeft_4x4 = {{ 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00110000, 0x00330022, // ABin + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin 0x0a0a0a0a, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, - 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_left_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16}; gpu_dp_inst_t uniExtactHalf8_2x8 = {{ 0x11111111, // TCfg @@ -634,7 +691,17 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniFp16toFp32_part1_4x4 = {{ + gpu_dp_inst_t uniFp16toFp32_Lo_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_Hi_4x4 = {{ 0x09090909, // TCfg 0x00000000, // ASelt 0x00150004, 0x00370026, // ABin @@ -647,7 +714,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_Lo_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniFp16toFp32_Hi_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -657,19 +725,21 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) else if (F16 == output_dtype) { status = vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_left_4x4); CHECK_STATUS_FAIL_GOTO(status, final ); } else { status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_left_4x4", &uniFp16toFp32_left_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &uint8Scale); status |= vsi_nn_kernel_gpu_add_param( node, 
"output_ZP", &uint8ZP_out); CHECK_STATUS_FAIL_GOTO(status, final ); } - status = vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); CHECK_STATUS_FAIL_GOTO(status, final ); } else if (BF16 == input_dtype && BF16 == output_dtype) diff --git a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c new file mode 100644 index 0000000..cc4c5f6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c @@ -0,0 +1,366 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_U8TOU8 CVIVANTE_NAMESPACE("evis.space2depth_internal_U8toU8") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_U8TOU8_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_U8toU8_X2Y1") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I8TOI8 CVIVANTE_NAMESPACE("evis.space2depth_internal_I8toI8") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I8TOI8_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_I8toI8_X2Y1") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I16TOI16 CVIVANTE_NAMESPACE("evis.space2depth_internal_I16toI16") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_I16TOI16_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_I16toI16_X2Y1") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_F16TOF16 CVIVANTE_NAMESPACE("evis.space2depth_internal_F16toF16") +#define VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_F16TOF16_X2Y1 CVIVANTE_NAMESPACE("evis.space2depth_internal_F16toF16_X2Y1") + +#define KERNEL_SOURCE_1 "space2depth_internal" + +// Add kernel hashtable here +#define HASH_SPACE2DEPTH_INTERNAL_KEY(_input0_type, _output_type, _opt_stride) \ + ((_input0_type << 24) | (_output_type << 16) | (_opt_stride << 8)) + +#define TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 0), \ + VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SPACE2DEPTH_INTERNAL_KEY(IN0_TYPE, OUT_TYPE, 1), \ + VX_KERNEL_NAME_SPACE2DEPTH_INTERNAL_##IN0_TYPE##TO##OUT_TYPE##_X2Y1, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } space2depth_internal_map[] = +{ + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_KERNELS(F16, F16, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_SPACE2DEPTH_INTERNAL_OPT_KERNELS(F16, F16, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t _space2depth_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _SPACE2DEPTH_INTERNAL_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + uint32_t input_dims = 0; + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + int32_t input_width = 0; + int32_t input_height = 0; + int32_t input_depth = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t opt_flg = 0; + + uint32_t pack_key = 0; 
+ + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &stride_x); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &stride_y); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_dims = (uint32_t)attr[0]->shape->size; + input_width = attr[0]->shape->data[0]; + input_height = attr[0]->shape->data[1]; + input_depth = input_dims > 2 ? attr[0]->shape->data[2] : 1; + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + if (stride_x == 2 && stride_y == 1) + { + shaderParam.global_scale[0] = 16; + if (attr[0]->dtype == F16 || attr[0]->dtype == I16) + { + shaderParam.global_scale[0] = 8; + } + opt_flg = 1; + } + shaderParam.global_size[0] = gpu_align_p2((input_width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = input_height; + shaderParam.global_size[2] = input_depth; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, OPT_FLG ) \ + (IN0_TYPE | (OUT_TYPE << 8) | (OPT_FLG << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, opt_flg); + + { + gpu_dp_inst_t uniExtractEvenUint8Stride2_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x0e0c0a08, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddUint8Stride2_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x07050301, 0x0f0d0b09, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniExtractEvenFp16Stride2_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddFp16Stride2_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "input_depth", &input_depth); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, U8, 0 ): + case _PACK_SELECT_KEY( I8, I8, 0 ): + case _PACK_SELECT_KEY( I16, I16, 0 ): + case _PACK_SELECT_KEY( F16, F16, 0 ): + break; + case _PACK_SELECT_KEY( U8, U8, 1 ): + case _PACK_SELECT_KEY( I8, I8, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + 
"uniExtractEvenUint8Stride2_2x8", &uniExtractEvenUint8Stride2_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddUint8Stride2_2x8", &uniExtractOddUint8Stride2_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16, 1 ): + case _PACK_SELECT_KEY( F16, F16, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractEvenFp16Stride2_4x4", &uniExtractEvenFp16Stride2_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddFp16Stride2_4x4", &uniExtractOddFp16Stride2_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } +#undef _PACK_SELECT_KEY + + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t opt_flg + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SPACE2DEPTH_INTERNAL_KEY( input0_dtype, output_dtype, opt_flg ); + + for( i = 0; i < _cnt_of_array(space2depth_internal_map); i ++ ) + { + if ( space2depth_internal_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(space2depth_internal_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", space2depth_internal_map[i].function_name ); + kernel->info.parameters = _space2depth_internal_kernel_param_def; + kernel->info.numParams = _SPACE2DEPTH_INTERNAL_PARAM_NUM; + kernel->info.initialize = _space2depth_internal_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + space2depth_internal_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + space2depth_internal_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_SPACE2DEPTH_INTERNAL_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" ); + int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); + int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 
1 : 0; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params, opt_flg ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + vsi_nn_kernel_node_pack_io( tmp_params, _SPACE2DEPTH_INTERNAL_PARAM_NUM, inputs, 1, outputs, 1 ); + tmp_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_x ); + tmp_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_y ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _SPACE2DEPTH_INTERNAL_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( space2depth_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c new file mode 100644 index 0000000..5d89b18 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c @@ -0,0 +1,422 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + UP_ORG = 0, + UP_K2, +} _internal_upscale_e; + +#define _UPSAMPLESCALE_KERNEL_SOURCE "upsamplescale" +#define _UPSAMPLESCALE_KERNEL_K2_SOURCE "upsamplescale_k2" +#define _UPSAMPLESCALE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.upsamplescale") + +#define STR(a) #a +// Add kernel hashtable here +#define UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, FLAG ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8) | ( FLAG << 16)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_ORG ), \ + CVIVANTE_NAMESPACE("evis.upsamplescale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _UPSAMPLESCALE_KERNEL_SOURCE } + +#define PACK_KERNEL_MAP_K2( IN_DTYPE, OUT_DTYPE ) \ + { UPSAMPLESCALE_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_K2 ), \ + CVIVANTE_NAMESPACE("evis.upsamplescale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_K2"), \ + _UPSAMPLESCALE_KERNEL_K2_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _upsamplescale_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F16, F16 ), + PACK_KERNEL_MAP( F16, I16 ), + PACK_KERNEL_MAP( F16, I8 ), + PACK_KERNEL_MAP( F16, U8 ), + PACK_KERNEL_MAP( I16, I16 ), + PACK_KERNEL_MAP( I16, F16 ), + PACK_KERNEL_MAP( I8, I8 ), + PACK_KERNEL_MAP( I8, F16 ), + PACK_KERNEL_MAP( U8, U8 ), + PACK_KERNEL_MAP( U8, F16 ), + + PACK_KERNEL_MAP_K2( F16, F16 ), + PACK_KERNEL_MAP_K2( F16, I16 ), + PACK_KERNEL_MAP_K2( F16, I8 ), + PACK_KERNEL_MAP_K2( F16, U8 ), + PACK_KERNEL_MAP_K2( I16, I16 ), + PACK_KERNEL_MAP_K2( I16, F16 ), + PACK_KERNEL_MAP_K2( I8, I8 ), + PACK_KERNEL_MAP_K2( I8, F16 ), + PACK_KERNEL_MAP_K2( U8, U8 ), + PACK_KERNEL_MAP_K2( U8, F16 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _upsamplescale_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _UPSAMPLESCALE_PARAM_NUM _cnt_of_array( _upsamplescale_kernel_param_def ) +#define SCALAR_STRIDE_VALUE (2) +#define SCALAR_SCALE_VALUE (3) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_upsamplescale_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define _PACK_UPSCALE_KEY( IN_TYPE, OUT_TYPE, FLAG ) \ + ( IN_TYPE | ( OUT_TYPE << 16) | (FLAG << 24) ) + + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + int32_t stride = 0; + float scale = 0; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + int32_t output_ZP = 0; + int32_t input_ZP = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + uint32_t pack_key = 0; + _internal_upscale_e flag = UP_ORG; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + in_shape = input_attr->shape; + 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_VALUE], &(stride)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &(scale)); + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + if (srcFixPointPos >=0 ) + scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos); + else + scaleIn = (float) ((int64_t)1 << -srcFixPointPos); + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) + { + input_ZP = input_attr->asymm.zero_point; + scaleIn = input_attr->asymm.scale; + } + + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + if (dstFixPointPos >=0 ) + scaleOut = 1.0f / (float) ((int64_t)1 << dstFixPointPos); + else + scaleOut = (float) ((int64_t)1 << -dstFixPointPos); + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) + { + output_ZP = output_attr->asymm.zero_point; + scaleOut = output_attr->asymm.scale; + } + + if (stride == 2 && scale >= 0) + { + flag = UP_K2; + } + + if ( flag == UP_K2 ) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.global_size[0] = gpu_align_p2( + (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (in_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = in_shape->size > 2 ? in_shape->data[2] : 1; + + pack_key = _PACK_UPSCALE_KEY( input_dtype, output_dtype, flag ); + + switch( pack_key ) + { + case _PACK_UPSCALE_KEY( F16, F16, UP_K2 ): + case _PACK_UPSCALE_KEY( F16, I16, UP_K2 ): + case _PACK_UPSCALE_KEY( F16, I8, UP_K2 ): + case _PACK_UPSCALE_KEY( F16, U8, UP_K2 ): + case _PACK_UPSCALE_KEY( I16, F16, UP_K2 ): + case _PACK_UPSCALE_KEY( I16, I16, UP_K2 ): + case _PACK_UPSCALE_KEY( I8, F16, UP_K2 ): + case _PACK_UPSCALE_KEY( I8, I8, UP_K2 ): + case _PACK_UPSCALE_KEY( U8, F16, UP_K2 ): + case _PACK_UPSCALE_KEY( U8, U8, UP_K2 ): + { + uint16_t multiplier = 0; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; + gpu_dp_inst_t uniUpSampleScale2X_lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x11111010, 0x13131212, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniUpSampleScale2X_hi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x15151414, 0x17171616, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit(scaleIn * scale / scaleOut, &multiplier, &postShift); + multAndoutZP[0] = (uint32_t)(multiplier); + multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * multiplier); + + uniUpSampleScale2X_lo_2x8.data[7] |= (postShift & 0x1F); + uniUpSampleScale2X_hi_2x8.data[7] |= (postShift & 0x1F); + + status = vsi_nn_kernel_gpu_add_param( node, "uniUpScale2X_lo_2x8", &uniUpSampleScale2X_lo_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, 
"uniUpScale2X_hi_2x8", &uniUpSampleScale2X_hi_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + } + break; + case _PACK_UPSCALE_KEY( F16, F16, UP_ORG ): + case _PACK_UPSCALE_KEY( F16, I16, UP_ORG ): + case _PACK_UPSCALE_KEY( F16, I8, UP_ORG ): + case _PACK_UPSCALE_KEY( F16, U8, UP_ORG ): + case _PACK_UPSCALE_KEY( I16, F16, UP_ORG ): + case _PACK_UPSCALE_KEY( I16, I16, UP_ORG ): + case _PACK_UPSCALE_KEY( I8, F16, UP_ORG ): + case _PACK_UPSCALE_KEY( I8, I8, UP_ORG ): + case _PACK_UPSCALE_KEY( U8, F16, UP_ORG ): + case _PACK_UPSCALE_KEY( U8, U8, UP_ORG ): + { + float output_scale = scaleIn * scale / scaleOut; + float tail = output_ZP - input_ZP * output_scale; + gpu_dp_inst_t uniConvertDatatoF32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_4x4", &uniConvertDatatoF32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "tail", &tail); + } + break; + default: + break; + } + +#undef _PACK_UPSCALE_KEY + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release( &input_attr ); + input_attr = NULL; + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + output_attr = NULL; + } + + return status; +} /* _upsamplescale_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t stride, + float scale + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _upsamplescale_kernel_map; + vx_param_description_t * param_def = _upsamplescale_kernel_param_def; + size_t param_def_size = _cnt_of_array( _upsamplescale_kernel_param_def ); + vx_kernel_initialize_f initializer = _upsamplescale_initializer; + _internal_upscale_e flag = (stride == 2 && scale >= 0 ) ? 
UP_K2 : UP_ORG; + + uint32_t key = 0; + int i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = UPSAMPLESCALE_HASH_KEY( in_dtype, out_dtype, flag ); + + for( i = 0; i < _cnt_of_array( _upsamplescale_kernel_map ); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array( _upsamplescale_kernel_map ) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_UPSAMPLESCALE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); + float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); + + status = _query_kernel( kernel, inputs, outputs, stride, scale ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLESCALE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_STRIDE_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &stride ); + node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLESCALE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] ); + VSI_ASSERT( status == VSI_SUCCESS ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( upsamplescale, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index f8d23a0..ef35bf7 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -24,6 +24,7 @@ #include #include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" #include "vsi_nn_error.h" #include "utils/vsi_nn_math.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" @@ -53,7 +54,7 @@ static vsi_bool compute_gpu_divisor int32_t i = 0; for( i = vsi_nn_min( input_value, limit - 1 ); i > 0; i -- ) { - if( ( i % gcd == 0 ) && ( input_value % i == 0 ) ) + if ( ( i % gcd == 0 ) && ( input_value % i == 0 ) ) { *divisor = i; return TRUE; @@ -75,7 +76,7 @@ static size_t element_fill_dim if (size_x == 1) return 0; - if( size_x < GPU_TENSOR_MAX_WIDTH) + if ( size_x < GPU_TENSOR_MAX_WIDTH) { shape_x[rank_x] = size_x; } @@ -85,7 +86,7 @@ static size_t element_fill_dim int32_t remainder = 0; compute_gpu_divisor( size_x, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); remainder = size_x / divisor; - if( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank) + if ( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank) { // Cannot optimize. shape_x[rank_x] = size_x; @@ -97,7 +98,7 @@ static size_t element_fill_dim * so it should be always 2. */ cost_size = 2; - if( size_x > 1 ) + if ( size_x > 1 ) { shape_x[rank_x] = divisor; shape_x[rank_x + 1] = remainder; @@ -170,25 +171,25 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize); rank_out += element_fill_dim(out_shape_output, rank_out, GPU_TENSOR_MAX_WIDTH, outerSize); - if( 0 == rank_in ) + if ( 0 == rank_in ) { out_shape_x[0] = 1; out_shape_x[1] = 1; rank_in = 2; } - else if( 1 == rank_in ) + else if ( 1 == rank_in ) { out_shape_x[1] = 1; rank_in = 2; } - if( 0 == rank_out ) + if ( 0 == rank_out ) { out_shape_output[0] = 1; out_shape_output[1] = 1; rank_out = 2; } - else if( 1 == rank_out ) + else if ( 1 == rank_out ) { out_shape_output[1] = 1; rank_out = 2; @@ -200,6 +201,75 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape return ret; } /* vsi_nn_kernel_optimize_reduce_shape() */ +vsi_bool vsi_nn_kernel_optimize_tensor_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t *axis, const size_t axis_size, + int32_t* out_shape_x, uint32_t* out_rank_x, + int32_t* out_axis, uint32_t* out_axis_size + ) +{ + vsi_bool ret = TRUE; + size_t i = 0; + size_t rank_in = 0; + size_t dims = 0; + int32_t innerSize = 1; + int32_t outerSize = 1; + int32_t axisSize = 1; + + for (i = 0; i < axis_size; i++) + { + axisSize *= shape_x[axis[i]]; + } + + for (i = 0; i < (size_t)axis[0]; i++) + { + innerSize *= shape_x[i]; + } + + for (i = axis[axis_size - 1] + 1; i < rank_x; i++) + { + outerSize *= shape_x[i]; + } + + rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, innerSize); + dims = element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, axisSize); + if (dims == 0) + { + out_axis[0] = (int32_t)rank_in; + *out_axis_size = 1; + out_shape_x[rank_in 
++] = 1; + } + else + { + *out_axis_size = (uint32_t)dims; + for (i = 0; i < dims; i++) + { + out_axis[i] = (int32_t)rank_in + (int32_t)i; + } + } + + rank_in += dims; + + rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize); + + if ( 0 == rank_in ) + { + out_shape_x[0] = 1; + out_shape_x[1] = 1; + rank_in = 2; + } + else if ( 1 == rank_in ) + { + out_shape_x[1] = 1; + rank_in = 2; + } + + *out_rank_x = (uint32_t)rank_in; + + return ret; +} /* vsi_nn_kernel_optimize_reduce_shape() */ + vsi_bool vsi_nn_kernel_optimize_element_shape ( const int32_t* shape_x, const size_t rank_x, @@ -218,13 +288,13 @@ vsi_bool vsi_nn_kernel_optimize_element_shape rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, element_num); - if( 0 == rank_in ) + if ( 0 == rank_in ) { out_shape_x[0] = 1; out_shape_x[1] = 1; rank_in = 2; } - else if( 1 == rank_in ) + else if ( 1 == rank_in ) { out_shape_x[1] = 1; rank_in = 2; @@ -275,13 +345,13 @@ vsi_bool vsi_nn_kernel_optimize_softmax_shape rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize); - if( 0 == rank_in ) + if ( 0 == rank_in ) { out_shape_x[0] = 1; out_shape_x[1] = 1; rank_in = 2; } - else if( 1 == rank_in ) + else if ( 1 == rank_in ) { out_shape_x[1] = 1; rank_in = 2; @@ -313,7 +383,7 @@ static size_t tile_fill_dim size_t cost_size = 1; VSI_ASSERT( rank <= max_rank ); VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); - if( size_output < GPU_TENSOR_MAX_WIDTH ) + if ( size_output < GPU_TENSOR_MAX_WIDTH ) { shape_x[rank] = size_x; shape_y[rank] = size_y; @@ -325,7 +395,7 @@ static size_t tile_fill_dim int32_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); remainder = size_output / divisor; - if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) + if ( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) { // Cannot optimize. shape_x[rank] = size_x; @@ -339,7 +409,7 @@ static size_t tile_fill_dim * so it should be always 2. 
*/ cost_size = 2; - if( size_x > 1 ) + if ( size_x > 1 ) { shape_x[rank] = divisor; shape_x[rank + 1] = remainder; @@ -349,7 +419,7 @@ static size_t tile_fill_dim shape_x[rank] = 1; shape_x[rank + 1] = 1; } - if( size_y > 1 ) + if ( size_y > 1 ) { shape_y[rank] = divisor; shape_y[rank + 1] = remainder; @@ -401,20 +471,20 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape sz = shape_output[i]; /* * Skip dim if the size is equal to 1 - * Also skip if( sx == 1 && sy == 1 ) + * Also skip if ( sx == 1 && sy == 1 ) */ - if( shape_output[i] == 1 ) + if ( shape_output[i] == 1 ) { continue; } // Update state state = TILE_STATE_EMPTY; - if( sx == sz ) + if ( sx == sz ) { state = TILE_STATE_NO_AXIS; } - else if( sx != sz ) + else if ( sx != sz ) { state = TILE_STATE_AXIS_X; } @@ -472,16 +542,16 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape break; } #undef _pack_state - if( append_dim ) + if ( append_dim ) { dims += tile_fill_dim( out_shape_x, out_shape_y, out_shape_output, dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); } } - if( ret ) + if ( ret ) { /* Append the last dim */ - if( i == rank_output ) + if ( i == rank_output ) { sx = effective_size_x; sy = effective_size_y; @@ -490,7 +560,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); } /* Avoid 1D shape*/ - if( 1 == dims ) + if ( 1 == dims ) { out_shape_x[1] = 1; out_shape_y[1] = 1; @@ -508,3 +578,39 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape #undef _swap_size return ret; } /* vsi_nn_kernel_optimize_eltwise_shape() */ + +vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape + ( + const int32_t* shape, const uint32_t rank, + int32_t* out_shape, uint32_t* out_rank + ) +{ + memcpy(out_shape, shape, sizeof(int32_t) * rank); + *out_rank = vsi_nn_max(rank, 2); + + out_shape[1] = rank == 1 ? 
1 : out_shape[1]; + + return TRUE; +} + +vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape + ( + const int32_t* shape, const uint32_t rank, + int32_t* out_shape, uint32_t* out_rank + ) +{ + uint32_t dim_num = 0; + uint32_t i = 0; + + vsi_nn_kernel_optimize_1d_tensor_shape( shape, + rank, out_shape, &dim_num); + + for (i = 3; i < dim_num; i++) + { + out_shape[2] *= out_shape[i]; + } + + *out_rank = vsi_nn_min(dim_num, 3); + + return TRUE; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vx/clip_vx.c b/src/tim/vx/internal/src/kernel/vx/clip_vx.c index 2c74303..3c4ab45 100644 --- a/src/tim/vx/internal/src/kernel/vx/clip_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/clip_vx.c @@ -131,10 +131,8 @@ static vsi_nn_kernel_node_t _setup float index[1024] = {0}; float value[1024] = {0}; - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || - inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index bb0d060..04e517c 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -255,7 +255,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; vx_tensor temp_tensors[3] = { NULL }; - int i; + int32_t i; vsi_bool need_explicit_padding = FALSE; _build_vx_conv2d_param( @@ -277,8 +277,17 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { - temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, - (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + int32_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t new_w_rank = 4; + new_w_shape[0] = 1; + new_w_shape[1] = inputs[1]->attr.size[0]; + new_w_shape[2] = 1; + for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++) + { + new_w_shape[2] *= inputs[1]->attr.size[i]; + } + new_w_shape[3] = 1; + temp_tensors[1] = vxReshapeTensor( inputs[1]->t, new_w_shape, new_w_rank ); CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); } else diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index c78de9d..0a64be9 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -165,10 +165,8 @@ static vsi_nn_kernel_node_t _setup float index[1024] = {0}; float value[1024] = {0}; - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || - inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c index 14ec73d..9afde85 100644 --- a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c @@ -135,10 +135,8 @@ static vsi_nn_kernel_node_t _setup float index[1024] = {0}; float value[1024] = {0}; - if ( 
inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || - inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) { return NULL; } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/layer_normalization.cl b/src/tim/vx/internal/src/libnnext/ops/cl/layer_normalization.cl new file mode 100644 index 0000000..8bc826b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/layer_normalization.cl @@ -0,0 +1,143 @@ + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_F32toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __write_only image2d_array_t output, + float eps, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float e2InScale, + float scale_inOut, + float sumZpScale, + float zp2ScaleE2, + float sumZpScaleE2, + int width, + int height, + float dim_ratio + ) +{ + int lidx = get_local_id(0); + int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0); + + float4 data, dst; + float2 sumSqr = (float2)(0); + float scale_vari, bias_val; + __local float2 local_sum[16]; + + for(; coord.x < width;) + { + data = read_imagef(input, coord); + coord.x += 16; + sumSqr.x += data.x; + sumSqr.y += data.x * data.x; + } + local_sum[lidx] = sumSqr; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx == 0) + { + for(int i = 1; i < 16; i++) + { + sumSqr += local_sum[i]; + } + local_sum[0] = sumSqr; + } + barrier(CLK_LOCAL_MEM_FENCE); + sumSqr = local_sum[0] * dim_ratio; + sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps; + sumSqr.s1 = rsqrt(sumSqr.s1); + + for(coord.x = lidx; coord.x < width;) + { + float4 gamma = read_imagef(scale, coord.xw); + float4 beta = read_imagef(bias, coord.xw); + data = read_imagef(input, coord); + + scale_vari = gamma.s0 * sumSqr.s1; + bias_val = (beta.s0 - scale_vari * sumSqr.s0); + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); + coord.x += 16; + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_U8toU8( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __write_only image2d_array_t output, + float eps, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float e2InScale, + float scale_inOut, + float sumZpScale, + float zp2ScaleE2, + float sumZpScaleE2, + int width, + int height, + float dim_ratio + ) +{ + int lidx = get_local_id(0); + int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0); + + uint4 data, dst; + float2 sumSqr; + uint tmpSum = 0, tmpSqr = 0; + float scale_vari, bias_val; + __local uint local_sum[1]; + __local uint local_sqr[1]; + + if(lidx == 0) + { + local_sum[0] = 0; + local_sqr[0] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for(; coord.x < width;) + { + data = read_imageui(input, coord); + coord.x+=16; + tmpSum += data.x; + tmpSqr += data.x * data.x; + } + atom_add(local_sum, tmpSum); + atom_add(local_sqr, tmpSqr); + barrier(CLK_LOCAL_MEM_FENCE); + tmpSum = local_sum[0]; + tmpSqr = local_sqr[0]; + //sumSqr.x = ((float)tmpSum - width * input_zp) * input_scale; + //sumSqr.y = ((float)tmpSqr - 2 * input_zp * (float)tmpSum + width * input_zp * input_zp) * e2InScale; + sumSqr.x = (float)tmpSum * input_scale - sumZpScale; + sumSqr.y = (float)tmpSqr * e2InScale - zp2ScaleE2 * 
(float)tmpSum + sumZpScaleE2; + + sumSqr *= dim_ratio; + sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps; + sumSqr.s1 = rsqrt(sumSqr.s1); + + for(coord.x = lidx; coord.x < width;) + { + float4 gamma = read_imagef(scale, coord.xw); + float4 beta = read_imagef(bias, coord.xw); + data = read_imageui(input, coord); + + scale_vari = gamma.s0 * sumSqr.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * sumSqr.s0) * output_scale + output_zp; + + float tmpVal = data.x - input_zp; + + float4 norm; + norm.x = tmpVal * alpha + bias_val; + dst = convert_uint4_rte(norm); + write_imageui(output, coord, dst); + coord.x+=16; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl index ec757ca..70a81da 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl @@ -6,32 +6,30 @@ __kernel void gemm_F32F32toF32_2D( int K, int N, int ac2zero, - int bc2zero + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out ) { - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int2 coord_a = (int2)(0, gidy); - int2 coord_b = (int2)(gidx, 0); - + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); float4 sum = (float4)(0); - for(; coord_a.x < K;) + for(; coord.z < K;) { float4 tempA0; float4 tempB0; - tempA0 = read_imagef(inputA, coord_a); - tempB0 = read_imagef(inputB, coord_b); - coord_a.x++; - coord_b.y++; + tempA0 = read_imagef(inputA, coord.zy); + tempB0 = read_imagef(inputB, coord.xz); + coord.z++; - sum += tempA0 * tempB0; + sum = sum + tempA0 * tempB0; } - - coord_b.y = gidy; - write_imagef(output, coord_b, sum); + write_imagef(output, coord.xy, sum); } __kernel void gemm_F32F32toF32_3D( @@ -42,7 +40,13 @@ __kernel void gemm_F32F32toF32_3D( int K, int N, int ac2zero, - int bc2zero + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out ) { int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); @@ -60,10 +64,160 @@ __kernel void gemm_F32F32toF32_3D( coord_a.x++; coord_b.y++; - sum += tempA0 * tempB0; + sum = sum + tempA0 * tempB0; } coord_b.y = get_global_id(1); coord_b.z = get_global_id(2); write_imagef(output, coord_b, sum); } + +__kernel void gemm_transb_F32F32toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + float4 sum = (float4)(0); + + for(; coord.z < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord.zy); + tempB0 = read_imagef(inputB, coord.zx); + coord.z++; + + sum = sum + tempA0 * tempB0; + } + write_imagef(output, coord.xy, sum); +} + +__kernel void gemm_transb_F32F32toF32_3D( + __read_only image2d_array_t inputA, + __read_only image2d_array_t inputB, + __write_only image2d_array_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 
0 : get_global_id(2)), 0); + + float4 sum = (float4)(0); + + for(; coord_a.x < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = read_imagef(inputB, coord_b); + coord_a.x++; + coord_b.x++; + + sum = sum + tempA0 * tempB0; + } + + coord_a.x = get_global_id(0); + coord_a.z = get_global_id(2); + write_imagef(output, coord_b, sum); +} + +__kernel void gemm_transb_F32I8toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + float4 sum = (float4)(0); + for(; coord.z < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord.zy); + tempB0 = convert_float4(read_imagei(inputB, coord.zx)); + coord.z++; + tempB0.x = (tempB0.x - zp_b) * scale_b; + + sum = sum + tempA0 * tempB0; + } + + write_imagef(output, coord.xy, sum); +} + +__kernel void gemm_transb_F32I8toF32_3D( + __read_only image2d_array_t inputA, + __read_only image2d_array_t inputB, + __write_only image2d_array_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 0 : get_global_id(2)), 0); + + float4 sum = (float4)(0); + + for(; coord_a.x < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = convert_float4(read_imagei(inputB, coord_b)); + tempB0.x = (tempB0.x - zp_b) * scale_b; + coord_a.x++; + coord_b.x++; + + sum = sum + tempA0 * tempB0; + } + + coord_a.x = get_global_id(0); + coord_a.z = get_global_id(2); + write_imagef(output, coord_b, sum); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl index 7c290d4..b7bc8ee 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl @@ -6,32 +6,30 @@ __kernel void gemm_transa_F32F32toF32_2D( int K, int N, int ac2zero, - int bc2zero + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out ) { - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int2 coord_a = (int2)(gidy, 0); - int2 coord_b = (int2)(gidx, 0); - + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); float4 sum = (float4)(0); - for(; coord_a.y < K;) + for(; coord.z < K;) { float4 tempA0; float4 tempB0; - tempA0 = read_imagef(inputA, coord_a); - tempB0 = read_imagef(inputB, coord_b); - coord_a.y++; - coord_b.y++; + tempA0 = read_imagef(inputA, coord.yz); + tempB0 = read_imagef(inputB, coord.xz); + coord.z++; - sum += tempA0 * tempB0; + sum = sum + tempA0 * tempB0; } - - coord_b.y = gidy; - write_imagef(output, coord_b, sum); + write_imagef(output, coord.xy, sum); } __kernel void gemm_transa_F32F32toF32_3D( @@ -42,7 +40,13 @@ __kernel void gemm_transa_F32F32toF32_3D( int K, int N, int ac2zero, - int bc2zero + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out ) { int gidx = get_global_id(0); @@ -63,7 +67,7 @@ __kernel void gemm_transa_F32F32toF32_3D( coord_a.y++; coord_b.y++; - sum += tempA0 * tempB0; + 
sum = sum + tempA0 * tempB0; } coord_b.y = gidy; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl new file mode 100644 index 0000000..feef55a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl @@ -0,0 +1,108 @@ +inline float roi_align_1x1 +( + __read_only image2d_array_t input, + float2 region_start, + float2 region_end, + float2 bin_size, + int2 grid_size, + float2 rcp_of_grid_size, + int pz +) +{ + float sum = 0; + + for(int iy = 0; iy < grid_size.y; ++iy) + { + for(int ix = 0; ix < grid_size.x; ++ix) + { + float2 ixy = (float2)(ix + 0.5f, iy + 0.5f); + float2 pos = region_start + ixy * bin_size * rcp_of_grid_size; + + int2 xy_low = convert_int2(pos); + int2 xy_high = xy_low + 1; + + float ly = pos.y - xy_low.y; + float lx = pos.x - xy_low.x; + float hy = 1.0f - ly; + float hx = 1.0f - lx; + + float w1 = hy * hx; + float w2 = hy * lx; + float w3 = ly * hx; + float w4 = ly * lx; + + float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x; + float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x; + float data3 = read_imagef(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x; + float data4 = read_imagef(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x; + + sum = sum + w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y); +} + + +#define EPS_GRID 0.00001f +__kernel void roi_align_F32toF32 +( + __read_only image2d_array_t input, + __read_only image2d_t rois, + __read_only image2d_t n_rois, + __write_only image2d_array_t output, + float spatial_x_scale, + float spatial_y_scale, + float in_width, + float in_height, + float rcp_of_out_width, + float rcp_of_out_height, + float sampling_x_ratio, + float sampling_y_ratio, + int depth +) +{ + int px = get_global_id(0); + int py = get_global_id(1); + int pw = get_global_id(2); + + int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x; + float4 roi_x = read_imagef(rois, (int2)(0, pw)); + float4 roi_y = read_imagef(rois, (int2)(1, pw)); + float4 roi_z = read_imagef(rois, (int2)(2, pw)); + float4 roi_w = read_imagef(rois, (int2)(3, pw)); + float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, roi_w.x); + + float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale); + float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f); + + float2 spatial_indx = (float2)(px, py); + float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height); + float2 max_spatial_dims = (float2)(in_width, in_height); + + float2 bin_size = roi_dims * pooled_dims; + float2 region_start = spatial_indx * bin_size + roi_anchor.xy; + float2 region_end = region_start + bin_size; + + float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio); + + roi_bin_grid = roi_bin_grid == 0 ? 
ceil(bin_size - EPS_GRID) : roi_bin_grid; + + int kz = roi_batch * depth; + float2 rcp_of_grid_size = 1.0f / roi_bin_grid; + int2 grid_size_xy = convert_int2(roi_bin_grid); + float4 interp; + int kz1 = pw * depth; + for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++) + { + interp.x = roi_align_1x1( input, + region_start, + region_end, + bin_size, + grid_size_xy, + rcp_of_grid_size, + kz); + + write_imagef(output, (int4)(px, py, kz1, 0), interp); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/space2depth_internal.cl b/src/tim/vx/internal/src/libnnext/ops/cl/space2depth_internal.cl new file mode 100644 index 0000000..fc39817 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/space2depth_internal.cl @@ -0,0 +1,90 @@ + +__kernel void space2depth_internal_F32toF32 ( + image2d_array_t input, + image2d_array_t output, + int block_size_x, int block_size_y, + float scaleInOut, float zpInOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int inDepth = get_image_array_size(input); + + int4 coord = (int4)(x, y, z, 0); + float4 data = {0.0}; + data = read_imagef(input, coord); + + ushort blockSize_x = convert_ushort(block_size_x); + ushort blockSize_y = convert_ushort(block_size_y); + int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0); + coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth + + z; + write_imagef(output, coord_out, data); +} + +__kernel void space2depth_internal_F32toF32_X2Y1 ( + image2d_array_t input, + image2d_array_t output, + int block_size_x, int block_size_y, + float scaleInOut, float zpInOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int inDepth = get_image_array_size(input); + + int4 coord = (int4)(x, y, z, 0); + float4 data = {0.0}; + data = read_imagef(input, coord); + + int4 coord_out = (int4)(x >> 1, y, 0, 0); + coord_out.z = (x & 1) * inDepth + z; + write_imagef(output, coord_out, data); +} + +__kernel void space2depth_internal_U8toU8 ( + image2d_array_t input, + image2d_array_t output, + int block_size_x, int block_size_y, + float scaleInOut, float zpInOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int inDepth = get_image_array_size(input); + + int4 coord = (int4)(x, y, z, 0); + uint4 data = {0}; + data = read_imageui(input, coord); + + ushort blockSize_x = convert_ushort(block_size_x); + ushort blockSize_y = convert_ushort(block_size_y); + int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0); + coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth + + z; + + data.x = convert_uint(data.x * scaleInOut + zpInOut); + write_imageui(output, coord_out, data); +} + +__kernel void space2depth_internal_U8toU8_X2Y1 ( + image2d_array_t input, + image2d_array_t output, + int block_size_x, int block_size_y, + float scaleInOut, float zpInOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int inDepth = get_image_array_size(input); + + int4 coord = (int4)(x, y, z, 0); + uint4 data = {0}; + data = read_imageui(input, coord); + + int4 coord_out = (int4)(x >> 1, y, 0, 0); + coord_out.z = (x & 1) * inDepth + z; + + data.x = convert_uint(data.x * scaleInOut + zpInOut); + write_imageui(output, coord_out, data); +} diff --git 
a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c deleted file mode 100644 index 3ab7764..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c +++ /dev/null @@ -1,253 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -void myTensorCropFunc - ( - int8_t *src, - int8_t *dst - ) -{ - - return; -} -vsi_status VX_CALLBACK TensorCropInternalKernel - (vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 2) - { - - } - - return status; -} - -vsi_status VX_CALLBACK TensorCropInitializer - (vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in threads - {0, 0, 0}}; // globalWorkSize: image size in threads - - vx_tensor input = (vx_tensor)paramObj[0]; - vx_tensor output = (vx_tensor)paramObj[1]; - uint32_t output_size[4] = {1, 1, 1, 1}; - vsi_enum dataFormat, dstFormat; - int8_t input_fixPointPos = 0; - vx_uint32 i = 0; - int32_t offset[3]; - size_t size[DIM_SIZE]; - vsi_nn_tensor_attr_t attr[2]; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr(input, &attr[0]); - status |= vsi_nn_vxGetTensorAttr(output, &attr[1]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - - dataFormat = attr[0].dtype.vx_type; - input_fixPointPos = attr[0].dtype.fl; - dstFormat = attr[1].dtype.vx_type; - for (i = 0; i < attr[1].dim_num; i++) - { - output_size[i] = attr[1].size[i]; - } - - vxCopyScalar((vx_scalar)paramObj[2], &offset[0], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[3], &offset[1], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[4], &offset[2], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - memset(size, 0, sizeof(size_t) * DIM_SIZE); - switch(dstFormat) - { - case VSI_NN_TYPE_INT8: - case VSI_NN_TYPE_UINT8: - size[0] = 16; - size[1] = 4; - break; - case VSI_NN_TYPE_INT16: - case VSI_NN_TYPE_UINT16: - case VSI_NN_TYPE_FLOAT16: - size[0] = 8; - size[1] = 4; - break; - } - - shaderParam.globalWorkOffset[0] = offset[0]; - shaderParam.globalWorkOffset[1] = offset[1]; - shaderParam.globalWorkOffset[2] = offset[2]; - shaderParam.globalWorkScale[0] = size[0]; - shaderParam.globalWorkScale[1] = size[1]; - shaderParam.globalWorkScale[2] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - shaderParam.globalWorkSize[2] = output_size[2]; - - if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16) - { - vx_uint32 uniConvertInt16toFp16_2x8[16] = { - 0x11111111, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }; - -#define cropMIN(x, y) (((x) <= (y)) ? (x) : (y)) -#define CROP_MAX_POST_SHIFT_BITS (31) -#define CROP_MAX_MULTIPLIER_NUM (65535) - - if (input_fixPointPos > 0) - { - vx_uint8 postshift = cropMIN(input_fixPointPos, CROP_MAX_POST_SHIFT_BITS); - - uniConvertInt16toFp16_2x8[7] |= (postshift & 0x1F); - } - else - { - vx_uint32 multiplier = cropMIN((int64_t)1 << (-input_fixPointPos), CROP_MAX_MULTIPLIER_NUM); - - for (i = 0; i < 8; i++) - { - uniConvertInt16toFp16_2x8[i + 8] = multiplier; - } - } -#undef cropMIN -#undef CROP_MAX_POST_SHIFT_BITS -#undef CROP_MAX_MULTIPLIER_NUM - - status |= vxSetNodeUniform(nodObj, "uniConvertInt16toFp16_2x8", 1, uniConvertInt16toFp16_2x8); - } - - vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); - } - return status; -} - -vx_param_description_t basekernel_tensorCrop_params[] = { - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxTensorCropKernelInt16Info = -{ - VX_KERNEL_ENUM_TENSORCROP_INT16, - VX_KERNEL_NAME_TENSORCROP_INT16, - NULL, - basekernel_tensorCrop_params, - (sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - TensorCropInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorCropKernelInt8Info = -{ - VX_KERNEL_ENUM_TENSORCROP_INT8, - VX_KERNEL_NAME_TENSORCROP_INT8, - NULL, - basekernel_tensorCrop_params, - (sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - TensorCropInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorCropKernelInt16Fp16Info = -{ - VX_KERNEL_ENUM_TENSORCROP_INT16_FP16, - VX_KERNEL_NAME_TENSORCROP_INT16_FP16, - NULL, - basekernel_tensorCrop_params, - (sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - TensorCropInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_CROP_list[] = -{ - NULL, - &vxTensorCropKernelInt16Info, - &vxTensorCropKernelInt8Info, - &vxTensorCropKernelInt16Fp16Info, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c deleted file mode 100644 index dacde22..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c +++ /dev/null @@ -1,323 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_FCL2) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_FULLYCONNECTED_AXIS2) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_fullconnect2") -#define _VX_KERNEL_FUNC_KERNEL (vxFullconnect2Kernel) - -//static uint32_t layerNum = 0; - -static vsi_status VX_CALLBACK vxFullconnect2Kernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - /* TODO: */ -#define ARG_NUM (2) -#define TENSOR_NUM_INPUT (3) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VX_SUCCESS; - uint32_t i, j, k; - vx_context context = NULL; - vsi_nn_tensor_attr_t attr[TENSOR_NUM]; - uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; - vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; - uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; - vx_tensor tensor[TENSOR_NUM]; - - //char fileName[256] = {'\0'}; - //uint32_t total_size; - int32_t axis, weights; - uint32_t num_fc = 1, num_no_fc = 1; - - - //prepare data - context = vxGetContext((vx_reference)node); - - for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); - } - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(axis), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(weights), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - //op calc - for(i = 0; i <= (uint32_t)axis; ++i) - { - num_fc *= attr[0].size[i]; - } - for(i = axis + 1; i < attr[0].dim_num; ++i) - { - num_no_fc *= attr[0].size[i]; - } - - for(k = 0; k < num_no_fc; ++k) - { - for(j = 0; j < (uint32_t)weights; ++j) - { - float sum; - vsi_nn_DtypeToFloat32(&buffer_ptr[2][stride_size[2][0] * j], &sum, &attr[2].dtype); - for(i = 0; i < num_fc; ++i) - { - float x, w; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * (i + num_fc * k)], - &x, &attr[0].dtype); - vsi_nn_DtypeToFloat32(&buffer_ptr[1][stride_size[1][0] * (i + num_fc * j)], - &w, &attr[1].dtype); - sum += w * x; - } - vsi_nn_Float32ToDtype(sum, &buffer_ptr[3][stride_size[3][0] * (j + weights * k)], - &attr[3].dtype); - } - } - -#if 0 - print_index = 3; - total_size = vsi_nn_ShapeProduct(size[print_index], dim_num[print_index]); - if (dim_num[print_index] == 3) - { - snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum, - size[print_index][0], size[print_index][1], size[print_index][2]); - } - else - { - snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum, - size[print_index][0], size[print_index][1], size[print_index][2], size[print_index][3]); - } - vsi_nn_SaveDataToText(fileName, buffer_ptr[print_index], total_size, - data_format[print_index], NULL); - layerNum++; -#endif - //save data - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - status = 
vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); - if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); - } - for( i = 0; i < TENSOR_NUM; i ++ ) - { - if (buffer_ptr[i]) free(buffer_ptr[i]); - } - - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -void myFullyConnected_Axis2Func - ( - int8_t *src, - int8_t *dst - ) -{ - - return; -} -vsi_status VX_CALLBACK vxFullyConnected_Axis2Kernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 2) - { - - } - - return status; -} - -vsi_status VX_CALLBACK vxFullyConnected_Axis2Initializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; - - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in threads - {0, 0, 0}}; // globalWorkSize: image size in threads - - uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; - uint32_t output_size[DIM_SIZE] = {1, 1, 1, 1}; - - uint32_t uniMulAcc_16x1[16] = { - 0x00005555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x00000000, // ABin - 0x00005555, // BSelt - 0x76543210, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - uint32_t loopNum = 0; - vsi_nn_tensor_attr_t attr[2]; - uint32_t i; - uint32_t input_dims = 0; - uint32_t output_dims = 0; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[1], &attr[0]); - status |= vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[3], &attr[1]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - input_dims = attr[0].dim_num; - for (i = 0; i < input_dims; i++) - { - input_size[i] = attr[0].size[i]; - } - output_dims = attr[1].dim_num; - for (i = 0; i < output_dims; i++) - { - output_size[i] = attr[1].size[i]; - } - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 1; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - - vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - vxSetNodeUniform(nodObj, "uniMulAcc_16x1", 1, uniMulAcc_16x1); - - loopNum = gcmALIGN(input_size[0], 32); - vxSetNodeUniform(nodObj, "loopNum", 1, &loopNum); - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); - } - return status; -} - -static vx_param_description_t vxFullyConnected_Axis2KernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxFullyConnected_Axis2KernelInfo = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - vxFullyConnected_Axis2Kernel, - vxFullyConnected_Axis2KernelParam, - (sizeof(vxFullyConnected_Axis2KernelParam) / sizeof(vxFullyConnected_Axis2KernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxFullyConnected_Axis2Initializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_FCL2_list[] = -{ - &_VX_KERNEL_VAR, - &vxFullyConnected_Axis2KernelInfo, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c deleted file mode 100644 index f259835..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c +++ /dev/null @@ -1,688 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -void myLayerNormFunc - ( - void* src, - int16_t* scale, - float* bias, - float eps, - void* dst, - uint32_t input_dim, - uint32_t width, - uint32_t height, - uint32_t channel, - uint32_t batch - ) -{ - uint32_t ch = (input_dim <= 2) ? 1 : channel; - uint32_t bn = (input_dim <= 3) ? 1 : batch; - uint32_t b = 0, c = 0, h = 0, w = 0; - - int16_t* imgIn, *imgOut; - imgIn = (int16_t*)src; - imgOut = (int16_t*)dst; - - VSILOGI("Hello myLayerNormFunc!\n"); - for (b = 0; b < bn; b++) - { - for (c = 0; c < ch; c++) - { - for (h = 0; h < height; h++) - { - uint32_t len = (h + (c + b*ch)*height) * width; - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - sum += vsi_nn_Fp16toFp32(imgIn[index]); - } - mean = sum / width; - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; - sumsq += data * data; - } - vari = sumsq / width; - vari = (float)(1.0 / sqrtf(vari + eps)); - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; - float scaleVal = vsi_nn_Fp16toFp32(scale[w]); - float biasVal = bias[w]; - float normVal = data * vari * scaleVal + biasVal; - imgOut[index] = vsi_nn_Fp32ToFp16(normVal); - } - } - } - } - return; -} -void myLayerNormFunc_u8 - ( - void* src, - int16_t* scale, - float* bias, - float eps, - void* dst, - uint32_t input_dim, - uint32_t width, - uint32_t height, - uint32_t channel, - uint32_t batch, - int32_t inZp, - int32_t outZp, - float inScale, - float outScale - ) -{ - uint32_t ch = (input_dim <= 2) ? 1 : channel; - uint32_t bn = (input_dim <= 3) ? 
1 : batch; - uint32_t b = 0, c = 0, h = 0, w = 0; - - uint8_t* imgIn, *imgOut; - imgIn = (uint8_t*)src; - imgOut = (uint8_t*)dst; - - VSILOGI("Hello myLayerNormFunc!\n"); - for (b = 0; b < bn; b++) - { - for (c = 0; c < ch; c++) - { - for (h = 0; h < height; h++) - { - uint32_t len = (h + (c + b*ch)*height) * width; - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - //sum += vsi_nn_Fp16toFp32(imgIn[index]); - sum += vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8); - } - mean = sum / width; - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - //float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; - float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean; - sumsq += data * data; - } - vari = sumsq / width; - vari = (float)(1.0 / sqrtf(vari + eps)); - for (w = 0; w < width; w++) - { - uint32_t index = len + w; - //float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; - float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean; - float scaleVal = vsi_nn_Fp16toFp32(scale[w]); - float biasVal = bias[w]; - float normVal = data * vari * scaleVal + biasVal; - //imgOut[index] = vsi_nn_Fp32ToFp16(normVal); - imgOut[index] = (vx_uint8)vsi_nn_Fp32ToAffine(normVal, outScale, outZp, VSI_NN_TYPE_UINT8); - } - } - } - } - return; -} -vsi_status VX_CALLBACK vxLayerNormKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 5) - { - vx_context context = NULL; - // tensor - vx_tensor imgObj[4] = { NULL }; - vsi_nn_tensor_attr_t attr[4]; - int16_t *input = NULL, *output = NULL, *scale = NULL; - float *bias = NULL; - uint32_t input_size[4] = {1, 1, 1, 1}, output_size[4] = {1, 1, 1, 1}; - uint32_t scale_size[4] = {1, 1, 1, 1}, bias_size[4] = {1, 1, 1, 1}; - uint32_t input_stride_size[4] = {0}; - uint32_t output_stride_size[4] = {0}; - uint32_t scale_stride_size[4] = {0}; - uint32_t bias_stride_size[4] = {0}; - vx_tensor_addressing input_user_addr = NULL; - vx_tensor_addressing output_user_addr = NULL; - vx_tensor_addressing scale_user_addr = NULL; - vx_tensor_addressing bias_user_addr = NULL; - vsi_nn_type_e inputFormat = VSI_NN_TYPE_FLOAT16, outputFormat = VSI_NN_TYPE_FLOAT16; - vsi_nn_type_e scaleFormat = VSI_NN_TYPE_FLOAT16, biasFormat = VSI_NN_TYPE_FLOAT16; - uint32_t input_dims = 0, output_dims = 0; - uint32_t scale_dims = 0, bias_dims = 0; - uint32_t i; - int32_t in_zp, out_zp; - float in_scale, out_scale; - // scalar - vx_scalar scalar[1] = { NULL }; - float eps = .0f; - - imgObj[0] = (vx_tensor)paramObj[0]; - imgObj[1] = (vx_tensor)paramObj[1]; - imgObj[2] = (vx_tensor)paramObj[2]; - imgObj[3] = (vx_tensor)paramObj[3]; - scalar[0] = (vx_scalar)paramObj[4]; - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[3], 0, sizeof(vsi_nn_tensor_attr_t)); - context = vxGetContext((vx_reference)node); - if (context == NULL) - { - VSILOGE("vxGetContext failure! at line %d\n", __LINE__); - goto OnError; - } - - status = vsi_nn_vxGetTensorAttr(imgObj[0], &attr[0]); - status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]); - status |= vsi_nn_vxGetTensorAttr(imgObj[2], &attr[2]); - status |= vsi_nn_vxGetTensorAttr(imgObj[3], &attr[3]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - goto OnError; - } - input_dims = attr[0].dim_num; - inputFormat = attr[0].dtype.vx_type; - for (i = 0; i < input_dims; i++) - { - input_size[i] = attr[0].size[i]; - } - in_zp = attr[0].dtype.zero_point; - in_scale = attr[0].dtype.scale; - - //bias - bias_dims = attr[1].dim_num; - biasFormat = attr[1].dtype.vx_type; - for (i = 0; i < bias_dims; i++) - { - bias_size[i] = attr[1].size[i]; - } - //scale - scale_dims = attr[2].dim_num; - scaleFormat = attr[2].dtype.vx_type; - for (i = 0; i < scale_dims; i++) - { - scale_size[i] = attr[2].size[i]; - } - - //output - output_dims = attr[3].dim_num; - outputFormat = attr[3].dtype.vx_type; - for (i = 0; i < output_dims; i++) - { - output_size[i] = attr[3].size[i]; - } - out_zp = attr[3].dtype.zero_point; - out_scale = attr[3].dtype.scale; - - input_size[2] = (input_dims <= 2)?1:input_size[2]; - input_size[3] = (input_dims <= 3)?1:input_size[3]; - - input_stride_size[0] = vsi_nn_GetTypeBytes(inputFormat); - output_stride_size[0] = vsi_nn_GetTypeBytes(outputFormat); - for (i=1; i< input_dims; i++) - { - input_stride_size[i] = input_stride_size[i-1] * input_size[i-1]; - output_stride_size[i] = output_stride_size[i-1] * output_size[i-1]; - } - input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t)); - output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t)); - input_user_addr = vxCreateTensorAddressing(context, input_size, input_stride_size, (vx_uint8)input_dims); - vsi_nn_copy_tensor_patch(imgObj[0], &attr[0], input, VX_READ_ONLY); - //scale and bias - scale_stride_size[0] = vsi_nn_GetTypeBytes(scaleFormat); - bias_stride_size[0] = vsi_nn_GetTypeBytes(biasFormat); - for (i=1; i< scale_dims; i++) - { - scale_stride_size[i] = scale_stride_size[i-1] * scale_size[i-1]; - bias_stride_size[i] = bias_stride_size[i-1] * bias_size[i-1]; - } - scale = (int16_t*)malloc(scale_size[0]*sizeof(int16_t)); - bias = (float*)malloc(bias_size[0]*sizeof(float)); - bias_user_addr = vxCreateTensorAddressing(context, bias_size, bias_stride_size, (vx_uint8)bias_dims); - vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], bias, VX_READ_ONLY); - scale_user_addr = vxCreateTensorAddressing(context, scale_size, scale_stride_size, (vx_uint8)scale_dims); - vsi_nn_copy_tensor_patch(imgObj[2], &attr[2], scale, VX_READ_ONLY); - - // scalar - status = vxCopyScalar(scalar[0], &eps, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if (status != VX_SUCCESS) - { - VSILOGE("vxCopyScalar failure! 
at line %d\n", __LINE__); - goto OnError; - } - // Call C Prototype - if(inputFormat == VSI_NN_TYPE_FLOAT16) - { - myLayerNormFunc(input, scale, bias, eps, output, input_dims, input_size[0], - input_size[1], input_size[2], input_size[3]); - } - else - { - myLayerNormFunc_u8(input, scale, bias, eps, output, input_dims, input_size[0], - input_size[1], input_size[2], input_size[3], in_zp, out_zp, in_scale, out_scale); - } - - //output tensor - output_user_addr = vxCreateTensorAddressing(context, output_size, - output_stride_size, (vx_uint8)output_dims); - vsi_nn_copy_tensor_patch(imgObj[3], &attr[3], output, VX_WRITE_ONLY); - -OnError: - if(input) free(input); - if(scale) free(scale); - if(bias) free(bias); - if(output) free(output); - if(input_user_addr) vxReleaseTensorAddressing(&input_user_addr); - if(scale_user_addr) vxReleaseTensorAddressing(&scale_user_addr); - if(bias_user_addr) vxReleaseTensorAddressing(&bias_user_addr); - if(output_user_addr) vxReleaseTensorAddressing(&output_user_addr); - } - - return status; -} -vsi_status VX_CALLBACK vxLayerNormInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_tensor input = (vx_tensor)paramObj[0]; - vx_tensor scale = (vx_tensor)paramObj[2]; - vx_tensor output = (vx_tensor)paramObj[3]; - uint32_t input_size[4] = {1, 1, 1, 1}; - uint32_t input_dims = 0; - vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16; - vsi_nn_type_e scaleDataFormat = VSI_NN_TYPE_FLOAT16; - vsi_nn_type_e outputDataFormat = VSI_NN_TYPE_FLOAT16; - vx_float32 scaleIn = 0; - vx_float32 scaleOut = 0; - vx_float32 reScaleOut_u8 = 0; - vx_float32 reOutZP = 0.f; - int32_t output_ZP = 0; - int32_t input_ZP = 0; - vx_uint32 iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - int32_t tmpZp2 = 0; - vx_float32 e2InScale = 0; - vsi_nn_tensor_attr_t attr[3]; - uint32_t i; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr[0]); - status |= vsi_nn_vxGetTensorAttr(output, &attr[1]); - status |= vsi_nn_vxGetTensorAttr(scale, &attr[2]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - - input_dims = attr[0].dim_num; - inputDataFormat = attr[0].dtype.vx_type; - for (i = 0; i < input_dims; i++) - { - input_size[i] = attr[0].size[i]; - } - input_ZP = attr[0].dtype.zero_point; - scaleIn = attr[0].dtype.scale; - outputDataFormat = attr[1].dtype.vx_type; - output_ZP = attr[1].dtype.zero_point; - scaleOut = attr[1].dtype.scale; - scaleDataFormat = attr[2].dtype.vx_type; - - if(outputDataFormat == VSI_NN_TYPE_UINT8) - { - reScaleOut_u8 = 1.0f / scaleOut; - reOutZP = (vx_float32)output_ZP; - } - iter = ((input_size[0] + 15) / 16) * 16; - sumInZp = input_ZP * iter * (-1); - tmpZp1 = (-2) * input_ZP; - tmpZp2 = iter * input_ZP * input_ZP; - e2InScale = scaleIn * scaleIn; - - input_size[2] = (input_dims <= 2)?1:input_size[2]; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkOffset[2] = 0; - shaderParam.globalWorkScale[0] = input_size[0]; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = 1; - shaderParam.globalWorkSize[0] = 1; - shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], 4); - shaderParam.globalWorkSize[2] = input_size[2]; - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); - } - { - vx_float32 dimRatio = 1.0f / (vx_float32)input_size[0]; - vx_uint32 uniFp16SumSqr_dp8x2[16] = { - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 UniFP16toFP32Lo4_dp4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }; - vx_uint32 uniExtractHalf4_dp4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }; - vx_uint32 uniConvertSecFp16Fp32_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }; - vx_uint32 uniSumU8_16x1[16] = { - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }; - vx_uint32 uniSqrSum_16x1[16] = { - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0x55555555, // BSelt - 0x76543210, 0xfedcba98, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniConvert1stUint8SubZpToFp32_4x4[16] = { - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }; - vx_uint32 uniConvert2ndUint8SubZpToFp32_4x4[16] = { - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }; - vx_uint32 uniConvert3rdUint8SubZpToFp32_4x4[16] = { - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00090008, 0x000b000a, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }; - vx_uint32 uniConvert4thUint8SubZpToFp32_4x4[16] = { - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x000d000c, 0x000f000e, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }; - vx_uint32 uniConvertInt32toUint8_2x8[16] = { - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 UniPackFP16even_2x8[16] = { - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant - }; - if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - status = vxSetNodeUniform(nodObj, "width", 1, &input_size[0]); - status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio); - status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4); - status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1); - status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1); - status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP); - status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn); - status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp); - status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1); - status |= 
vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2); - status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale); - status |= vxSetNodeUniform(nodObj, "UniPackFP16even_2x8", 1, UniPackFP16even_2x8); - } - else - { - status = vxSetNodeUniform(nodObj, "uniFp16SumSqr_dp8x2", 1, uniFp16SumSqr_dp8x2); - status |= vxSetNodeUniform(nodObj, "width", 1, &input_size[0]); - status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio); - status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4); - status |= vxSetNodeUniform(nodObj, "uniExtractHalf4_dp4x4", 1, uniExtractHalf4_dp4x4); - status |= vxSetNodeUniform(nodObj, "uniConvertInt32toUint8_2x8", 1, uniConvertInt32toUint8_2x8); - status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1); - status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1); - status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4); - status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP); - status |= vxSetNodeUniform(nodObj, "output_ZP", 1, &output_ZP); - status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn); - status |= vxSetNodeUniform(nodObj, "outputScale", 1, &reScaleOut_u8); - status |= vxSetNodeUniform(nodObj, "outputZP", 1, &reOutZP); - status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp); - status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1); - status |= vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2); - status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale); - } - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); - } - } - return status; -} -static vx_param_description_t vxLayerNormKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxLayerNormKernelInfo = -{ - VX_KERNEL_ENUM_LAYERNORM, - VX_KERNEL_NAME_LAYERNORM, - NULL, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxLayerNormInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxLayerNormKernelInfo_u8 = -{ - VX_KERNEL_ENUM_LAYERNORM, - VX_KERNEL_NAME_LAYERNORM_UINT8, - NULL, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxLayerNormInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxLayerNormKernelInfo_FP16toU8 = -{ - VX_KERNEL_ENUM_LAYERNORM_FP16TOU8, - VX_KERNEL_NAME_LAYERNORM_FP16TOU8, - NULL, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxLayerNormInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxLayerNormKernelInfo_U8toFP16 = -{ - VX_KERNEL_ENUM_LAYERNORM, - VX_KERNEL_NAME_LAYERNORM_U8TOFP16, - NULL, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxLayerNormInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxLayerNormKernelInfo_CPU = -{ - VX_KERNEL_ENUM_LAYERNORM, - VX_KERNEL_NAME_LAYERNORM, - vxLayerNormKernel, - vxLayerNormKernelParam, - (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_LAYERNORM_list[] = -{ - &vxLayerNormKernelInfo_CPU, - &vxLayerNormKernelInfo, - &vxLayerNormKernelInfo_u8, - &vxLayerNormKernelInfo_FP16toU8, - &vxLayerNormKernelInfo_U8toFP16, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c deleted file mode 100644 index fa478d0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c +++ /dev/null @@ -1,190 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_REDUCE) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_REDUCE) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_reduce") -#define _VX_KERNEL_FUNC_KERNEL (vxReduceKernel) - -static vx_status VX_CALLBACK vxReduceKernel - ( - vx_node node, - const vx_reference* paramObj, - vx_uint32 paramNum - ) -{ - /* TODO: */ -#define ARG_NUM (6) -#define TENSOR_NUM_INPUT (1) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vx_status status = VX_SUCCESS; - vx_context context = NULL; - vsi_nn_tensor_attr_t attr[TENSOR_NUM]; - vx_uint32 stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; - vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; - vx_uint8 *buffer_ptr[TENSOR_NUM] = {NULL}; - vx_tensor tensor[TENSOR_NUM]; - - vx_float32 factor0; - vx_int32 factor; - vx_uint32 batch, c, h, w; - vx_uint32 i, j, k, b; - - //prepare data - context = vxGetContext((vx_reference)node); - - for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); - } - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - //op calc - if (factor0 > 1) - { - factor = (vx_int32)(factor0 + 0.5); - w = attr[0].size[0]; - h = attr[0].size[1]; - c = attr[0].size[2]; - batch = 1; - for(b = 0; b < batch; ++b){ - for(k = 0; k < c; ++k){ - for(j = 0; j < h*factor; ++j){ - for(i = 0; i < w*factor; ++i){ - vx_int32 in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor; - vx_int32 out_index = b*w*h*c*factor*factor + k*w*h*factor*factor + - j*w*factor + i; - vx_float32 fval; - //out[out_index] = in[in_index]; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], - &fval, &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - } - else - { - factor = (vx_int32)(1 / factor0 + 0.5); - w = attr[1].size[0]; - h = attr[1].size[1]; - c = attr[1].size[2]; - batch = 1; - for(b = 0; b < batch; ++b){ - for(k = 0; k < c; ++k){ - for(j = 0; j < h; ++j){ - for(i = 0; i < w; ++i){ - vx_int32 in_index = b*w*h*c*factor*factor + - k*w*h*factor*factor + j*w*factor*factor + i*factor; - vx_int32 out_index = b*w*h*c + k*w*h + j * w + i; - vx_float32 fval; - //out[out_index] = in[in_index]; - 
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval, - &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - } - - //save data - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); - if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); - } - for( i = 0; i < TENSOR_NUM; i ++ ) - { - if (buffer_ptr[i]) free(buffer_ptr[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_REDUCE_list[] = -{ - &_VX_KERNEL_VAR, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c deleted file mode 100644 index ef9a073..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c +++ /dev/null @@ -1,283 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_RESIZE) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_RESIZE) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_resize") -#define _VX_KERNEL_FUNC_KERNEL (vxResizeKernel) - -static vsi_status VX_CALLBACK vxResizeKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - /* TODO: */ -#define ARG_NUM (1) -#define TENSOR_NUM_INPUT (1) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VX_SUCCESS; - vx_context context = NULL; - vsi_nn_tensor_attr_t attr[TENSOR_NUM]; - uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; - vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; - uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; - vx_tensor tensor[TENSOR_NUM]; - - float factor0; - int32_t factor; - uint32_t batch, c, h, w; - uint32_t i, j, k, b; - - //prepare data - context = vxGetContext((vx_reference)node); - - for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); - } - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - //op calc - if (factor0 > 1) - { - factor = (int32_t)(factor0 + 0.5); - w = attr[0].size[0]; - h = attr[0].size[1]; - c = attr[0].size[2]; - batch = 1; - for(b = 0; b < batch; ++b){ - for(k = 0; k < c; ++k){ - for(j = 0; j < h*factor; ++j){ - for(i = 0; i < w*factor; ++i){ - int32_t in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor; - int32_t out_index = b*w*h*c*factor*factor + k*w*h*factor*factor + - j*w*factor + i; - float fval; - //out[out_index] = in[in_index]; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], - &fval, &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - } - else - { - factor = (int32_t)(1 / factor0 + 0.5); - w = attr[1].size[0]; - h = attr[1].size[1]; - c = attr[1].size[2]; - batch = 1; - for(b = 0; b < batch; ++b){ - for(k = 0; k < c; ++k){ - for(j = 0; j < h; ++j){ - for(i = 0; i < w; ++i){ - int32_t in_index = b*w*h*c*factor*factor + - k*w*h*factor*factor + j*w*factor*factor + i*factor; - int32_t out_index = b*w*h*c + k*w*h + j * w + i; - float fval; - //out[out_index] = in[in_index]; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval, - &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - } - - //save data - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); - if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); - } - for( i = 0; i < TENSOR_NUM; i ++ ) - { - if (buffer_ptr[i]) free(buffer_ptr[i]); - } - return status; 
-} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -vsi_status VX_CALLBACK vxTensorResizeInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - uint32_t uniPackEvenData_2x8[16] = { - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00003400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vsi_status status = VX_SUCCESS; - - vx_tensor input = (vx_tensor)paramObj[0]; - uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; - vsi_nn_tensor_attr_t attr; - uint32_t i, input_dim; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - input_dim = attr.dim_num; - for (i = 0; i < input_dim; i++) - { - input_size[i] = attr.size[i]; - } - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 16; - shaderParam.globalWorkScale[1] = 2; - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - - vxSetNodeUniform(nodObj, "uniPackEvenData_2x8", 1, uniPackEvenData_2x8); - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -static vx_param_description_t vxTensorResizeKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorResize16BitsDownSampleQuarterKernelInfo = -{ - VX_KERNEL_ENUM_RESIZE_16BITS_DOWNSAMPLE_QUARTER, - VX_KERNEL_NAME_RESIZE_16BITS_DOWNSAMPLE_QUARTER, - NULL, - vxTensorResizeKernelParam, - (sizeof(vxTensorResizeKernelParam) / sizeof(vxTensorResizeKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxTensorResizeInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorResize8BitsDownSampleQuarterKernelInfo = -{ - VX_KERNEL_ENUM_RESIZE_8BITS_DOWNSAMPLE_QUARTER, - VX_KERNEL_NAME_RESIZE_8BITS_DOWNSAMPLE_QUARTER, - NULL, - vxTensorResizeKernelParam, - (sizeof(vxTensorResizeKernelParam) / 
sizeof(vxTensorResizeKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxTensorResizeInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_RESIZE_list[] = -{ - &_VX_KERNEL_VAR, - &vxTensorResize16BitsDownSampleQuarterKernelInfo, - &vxTensorResize8BitsDownSampleQuarterKernelInfo, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c deleted file mode 100644 index 0287f19..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c +++ /dev/null @@ -1,317 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_ROI_ALIGN) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_ROI_ALIGN) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_ROI_ALIGN) -#define _VX_KERNEL_FUNC_KERNEL (vxRoi_alignKernel) - -static vsi_status VX_CALLBACK vxRoi_alignKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (6) -#define TENSOR_NUM_INPUT (3) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - int32_t output_height; - int32_t output_width; - float height_ratio; - float width_ratio; - int32_t height_sample_num; - int32_t width_sample_num; - - uint32_t i = 0; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - if (i == 2) - { - int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context, - input[i], &in_attr[i]); - } - else - { - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float)); - memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); - } - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(output_height), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(output_width), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(height_ratio), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(width_ratio), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(height_sample_num), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(width_sample_num), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - /* 
TODO: Add CPU kernel implement */ - { - uint32_t n, j, k; - uint32_t kRoiDim = 4; - float heightScale = 1.0f / height_ratio; - float widthScale = 1.0f / width_ratio; - uint32_t inHeight = in_attr[0].size[2]; - uint32_t inWidth = in_attr[0].size[1]; - uint32_t inDepth = in_attr[0].size[0]; - uint32_t numRois = in_attr[1].size[1]; - uint32_t outHeight = out_attr[0].size[2]; - uint32_t outWidth = out_attr[0].size[1]; - uint32_t out_index = 0; - - for(n = 0; n < numRois; n++) - { - uint32_t batchId = int32_in_buffer[2][n]; - float scale = (in_attr[1].dtype.vx_type == VSI_NN_TYPE_UINT16) ? 0.125f : 1.0f; - float wRoiStart = f32_in_buffer[1][n * kRoiDim] * widthScale * scale; - float hRoiStart = f32_in_buffer[1][n * kRoiDim + 1] * heightScale * scale; - float wRoiEnd = f32_in_buffer[1][n * kRoiDim + 2] * widthScale * scale; - float hRoiEnd = f32_in_buffer[1][n * kRoiDim + 3] * heightScale * scale; - - float roiWidth = vsi_nn_max((wRoiEnd - wRoiStart), 1.0f); - float roiHeight = vsi_nn_max((hRoiEnd - hRoiStart), 1.0f); - float wStepSize = roiWidth / outWidth; - float hStepSize = roiHeight / outHeight; - - uint32_t wSamplingRatio = width_sample_num > 0 - ? width_sample_num : (uint32_t)ceil(wStepSize); - uint32_t hSamplingRatio = height_sample_num > 0 - ? height_sample_num : (uint32_t)ceil(hStepSize); - int32_t numSamplingPoints = wSamplingRatio * hSamplingRatio; - float wBinSize = wStepSize / (float)(wSamplingRatio); - float hBinSize = hStepSize / (float)(hSamplingRatio); - - int32_t batch_base_index = batchId * inHeight * inWidth * inDepth; - - for (i = 0; i < outHeight; i++) - { - for (j = 0; j < outWidth; j++) - { - float wStart = wStepSize * j + wRoiStart; - float wEnd = wStepSize * (j + 1) + wRoiStart; - float hStart = hStepSize * i + hRoiStart; - float hEnd = hStepSize * (i + 1) + hRoiStart; - - float x,y; - for (y = hStart + hBinSize / 2; y < hEnd; y += hBinSize) - { - for (x = wStart + wBinSize / 2; x < wEnd; x += wBinSize) - { - uint32_t x1 = (uint32_t)floor(x); - uint32_t y1 = (uint32_t)floor(y); - uint32_t x2 = x1 + 1, y2 = y1 + 1; - float dx1 = x - (float)(x1); - float dy1 = y - (float)(y1); - if (x1 >= inWidth - 1) { - x1 = x2 = inWidth - 1; - dx1 = 0; - } - if (y1 >= inHeight - 1) { - y1 = y2 = inHeight - 1; - dy1 = 0; - } - { - float dx2 = 1.0f - dx1, dy2 = 1.0f - dy1; - float ws[] = {dx2 * dy2, dx1 * dy2, - dx2 * dy1, dx1 * dy1}; - uint32_t offsets[] = {y1 * inWidth * inDepth + x1 * inDepth, - y1 * inWidth * inDepth + x2 * inDepth, - y2 * inWidth * inDepth + x1 * inDepth, - y2 * inWidth * inDepth + x2 * inDepth}; - for (k = 0; k < inDepth; k++) { - float interpolation = 0; - uint32_t c; - for (c = 0; c < 4; c++) - { - interpolation += ws[c] - * f32_in_buffer[0][batch_base_index + offsets[c] + k]; - } - f32_out_buffer[0][out_index + k] += interpolation; - } - } - } - } - for (k = 0; k < inDepth; k++) - { - f32_out_buffer[0][out_index + k] /= (float)(numSamplingPoints); - } - out_index += inDepth; - } - } - } - } - - /* save data */ - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[i], &out_attr[i], f32_out_buffer[i], - out_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - -final: - for (i = 0; i < TENSOR_NUM_INPUT; i++) - { - if (f32_in_buffer[i]) free(f32_in_buffer[i]); - if (int32_in_buffer[i]) free(int32_in_buffer[i]); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if (f32_out_buffer[i]) free(f32_out_buffer[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static 
vx_param_description_t vxRoi_alignKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxRoi_alignInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxRoi_align_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxRoi_alignKernelParam, - _cnt_of_array( vxRoi_alignKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxRoi_align_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxRoi_alignKernelParam, - _cnt_of_array( vxRoi_alignKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxRoi_alignInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_ROI_ALIGN_list[] = -{ - &vxRoi_align_CPU, - &vxRoi_align_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c deleted file mode 100644 index d97517e..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c +++ /dev/null @@ -1,410 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_SCALE) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SCALE) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_scale") -#define _VX_KERNEL_FUNC_KERNEL (vxScaleKernel) - -static vsi_status VX_CALLBACK vxScaleKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if( 6 == paramNum ) - { - vx_context context = NULL; - vx_tensor input_tensor = NULL; - vx_tensor scale_tensor = NULL; - vx_tensor bias_tensor = NULL; - vx_tensor output_tensor = NULL; - uint8_t * input_buffer = NULL; - uint8_t * scale_buffer = NULL; - uint8_t * bias_buffer = NULL; - uint8_t * output_buffer = NULL; - vx_scalar axis_scalar = NULL; - vx_scalar has_bias_scalar = NULL; - int axis = 1; - float has_bias = 0; - uint32_t input_dims = 0; - uint32_t scale_dims = 0; - uint32_t bias_dims = 0; - uint32_t output_dims = 0; - vsi_enum inputFormat = VSI_NN_TYPE_FLOAT16; - vsi_enum scaleFormat = VSI_NN_TYPE_FLOAT16; - vsi_enum biasFormat = VSI_NN_TYPE_FLOAT32; - vsi_enum outputFormat = VSI_NN_TYPE_FLOAT16; - uint32_t input_size[4] = {1, 1, 1, 1}; - uint32_t scale_size[4] = {1, 1, 1, 1}; - uint32_t bias_size[4] = {1, 1, 1, 1}; - uint32_t output_size[4] = {1, 1, 1, 1}; - uint32_t input_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint32_t output_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 }; - vx_tensor_addressing input_user_addr = NULL; - vx_tensor_addressing scale_user_addr = NULL; - vx_tensor_addressing bias_user_addr = NULL; - vx_tensor_addressing output_user_addr = NULL; - vsi_nn_tensor_attr_t out_attr; - - status = VX_SUCCESS; - - memset(&out_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); - - input_tensor = (vx_tensor)paramObj[0]; - scale_tensor = (vx_tensor)paramObj[1]; - bias_tensor = (vx_tensor)paramObj[2]; - output_tensor = (vx_tensor)paramObj[3]; - axis_scalar = (vx_scalar)paramObj[4]; - has_bias_scalar = (vx_scalar)paramObj[5]; - - context = vxGetContext((vx_reference)node); - if( NULL == context) - { - VSILOGE("vxGetContext failure!\n"); - status = VX_FAILURE; - goto OnError; - } - - input_buffer = vsi_nn_ConvertRawTensorToData(context, input_tensor, - &input_dims, &inputFormat, input_size, input_stride_size, - &input_user_addr, VX_READ_ONLY); - if( NULL == input_buffer ) - { - VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); - status = VX_ERROR_NO_MEMORY; - goto OnError; - } - - scale_buffer = vsi_nn_ConvertRawTensorToData(context, scale_tensor, - &scale_dims, &scaleFormat, scale_size, input_stride_size, - &scale_user_addr, VX_READ_ONLY); - if( NULL == scale_buffer ) - { - VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); - status = VX_ERROR_NO_MEMORY; - goto OnError; - } - - bias_buffer = vsi_nn_ConvertRawTensorToData(context, bias_tensor, - &bias_dims, &biasFormat, bias_size, input_stride_size, - &bias_user_addr, VX_READ_ONLY); - if( NULL == bias_buffer ) - { - VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); - status = VX_ERROR_NO_MEMORY; - goto OnError; - } - - output_buffer = vsi_nn_ConvertRawTensorToData(context, output_tensor, - &output_dims, &outputFormat, output_size, 
output_stride_size, - &output_user_addr, VX_WRITE_ONLY); - if( NULL == output_buffer ) - { - VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); - status = VX_ERROR_NO_MEMORY; - goto OnError; - } - - status = vsi_nn_vxGetTensorAttr(output_tensor, &out_attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - goto OnError; - } - - status = vxCopyScalar(axis_scalar, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if( VX_SUCCESS != status) - { - VSILOGE("vxCopyScalar axis failure! status:%d\n", status); - goto OnError; - } - status = vxCopyScalar(has_bias_scalar, &has_bias, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if( VX_SUCCESS != status ) - { - VSILOGE("vxCopyScalar axis failure! has_bias:%f\n", has_bias); - goto OnError; - } - - if( input_dims != output_dims ) - { - VSILOGE("Invalid parameters, input_dims output_dims mismatch %d:%d\n", - input_dims, output_dims); - status = VX_ERROR_INVALID_PARAMETERS; - goto OnError; - } - if( input_size[0] != scale_size[0] || input_size[0] != bias_size[0] ) - { - VSILOGE("Invalid parameters, input size mismatch %d:%d:%d\n", - input_size[0], scale_size[0], bias_size[0]); - status = VX_ERROR_INVALID_PARAMETERS; - goto OnError; - } - { - uint32_t i = 0; - uint32_t j = 0; - uint32_t fixed_num = 1; - uint32_t changed_num = 1; - - fixed_num = input_size[1] * input_size[2] * input_size[3]; - changed_num = input_size[0]; - - for( i = 0; i < fixed_num; i++ ) - { - int16_t* cur_input_row_ofst = ((int16_t *)input_buffer) + i * changed_num; - int16_t* cur_scale_row_ofst = ((int16_t *)scale_buffer); - float* cur_bias_row_ofst = ((float *)bias_buffer); - int16_t* cur_output_row_ofst = ((int16_t *)output_buffer) + i * changed_num; - - for( j = 0; j < changed_num; j++ ) - { - float cur_input_v = vsi_nn_Fp16ToFp32(*(cur_input_row_ofst + j)); - float cur_scale_v = vsi_nn_Fp16ToFp32(*(cur_scale_row_ofst + j)); - float cur_bias_v = *(cur_bias_row_ofst + j); - - float cur_result = cur_input_v * cur_scale_v + cur_bias_v; - *(cur_output_row_ofst + j) = vsi_nn_Fp32ToFp16(cur_result); - } - } - -#if defined(_SAVE_TENSOR) - { - static int count = 0; - char fname[256] = { 0 }; - sprintf(fname, "scale_output_tensor.%d.axis.%d.txt", count, axis); - vsi_nn_SaveDataToText(fname, output_buffer, - vsi_nn_ShapeProduct(output_size, output_dims), VSI_NN_TYPE_FLOAT16, NULL); - count++; - } -#endif - } - status = vsi_nn_vxCopyDataToTensor(context, output_tensor, &out_attr, output_buffer); - TEST_CHECK_STATUS(status, OnError); -OnError: - if( NULL != input_buffer ) - { - free( input_buffer ); - input_buffer = NULL; - } - if( NULL != scale_buffer ) - { - free( scale_buffer ); - scale_buffer = NULL; - } - if( NULL != bias_buffer ) - { - free( bias_buffer ); - bias_buffer = NULL; - } - if( NULL != output_buffer ) - { - free( output_buffer ); - output_buffer = NULL; - } - - if (input_user_addr) - { - vxReleaseTensorAddressing(&input_user_addr); - } - if (scale_user_addr) - { - vxReleaseTensorAddressing(&scale_user_addr); - } - if (bias_user_addr) - { - vxReleaseTensorAddressing(&bias_user_addr); - } - if (output_user_addr) - { - vxReleaseTensorAddressing(&output_user_addr); - } - - } - - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, 
VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -vsi_status VX_CALLBACK vxScaleInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - uint32_t uniExtractHalf8_2x8[16] = { - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant - }; - uint32_t uniFp16MulFp16ToFp32_Lo_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x01010101, // BSelt - 0x00010000, 0x00030002, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - uint32_t uniFp16MulFp16ToFp32_Hi_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x01010101, // BSelt - 0x00050004, 0x00070006, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - - vsi_status status = VX_SUCCESS; - - vx_tensor input = (vx_tensor)paramObj[0]; - uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; - vx_uint32 i = 0; - vsi_nn_tensor_attr_t attr; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - for (i = 0; i < attr.dim_num; i++) - { - input_size[i] = attr.size[i]; - } - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 8; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - - vxSetNodeUniform(nodObj, "uniExtractHalf8_2x8", 1, uniExtractHalf8_2x8); - vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Lo_4x4", 1, uniFp16MulFp16ToFp32_Lo_4x4); - vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Hi_4x4", 1, uniFp16MulFp16ToFp32_Hi_4x4); - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -static vx_param_description_t vxScaleKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaleKernelInfo = -{ - VX_KERNEL_ENUM_SCALE, - VX_KERNEL_NAME_SCALE_FP16, - NULL, - vxScaleKernelParam, - (sizeof(vxScaleKernelParam) / sizeof(vxScaleKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaleInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_SCALE_list[] = -{ - &_VX_KERNEL_VAR, - &vxScaleKernelInfo, - NULL -}; -#ifdef __cplusplus -} -#endif - diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c deleted file mode 100644 index acdc249..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c +++ /dev/null @@ -1,345 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -vsi_status vxShuffleChannelFunc - ( - vx_context context, - vx_tensor input, - vx_tensor output, - int32_t group_number, - int32_t axis - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_tensor_attr_t input_attr; - vsi_nn_tensor_attr_t output_attr; - uint8_t *in_data = NULL; - uint8_t *out_data = NULL; - uint32_t stride_size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t buf_sz = 0; - uint32_t group_row = group_number; - uint32_t chs = 0, group_col = 0; - uint32_t len = 1, num = 1, feature_map_size = 1; - uint32_t n = 0, i = 0, j = 0; - uint32_t type_bytes = 0, len_bytes = 0, fms_bytes = 0; - - status = vsi_nn_vxGetTensorAttr(input, &input_attr); - status |= vsi_nn_vxGetTensorAttr(output, &output_attr); - TEST_CHECK_STATUS(status, final); - in_data = vsi_nn_vxCopyTensorToData(context, input, &input_attr); - TEST_CHECK_PTR(in_data, final); - buf_sz = vsi_nn_GetStrideSize(&output_attr, stride_size); - out_data = (uint8_t *)malloc( buf_sz ); - TEST_CHECK_PTR(out_data, final); - - chs = input_attr.size[axis]; - group_col = chs / group_row; - type_bytes = vsi_nn_TypeGetBytes( input_attr.dtype.vx_type ); - - for ( i = 0; i < (uint32_t)axis; i++) - { - len *= input_attr.size[i]; - } - for ( i = axis + 1; i < input_attr.dim_num; i++) - { - num *= input_attr.size[i]; - } - for ( i = 0; i <= (uint32_t)axis; i++) - { - feature_map_size *= input_attr.size[i]; - } - - /* Shuffle Channel CPU Implement, the shape and dtype of output must same as input */ - len_bytes = len * type_bytes; - fms_bytes = feature_map_size * type_bytes; - for ( n = 0; n < num; n++) - { - for ( i = 0; i < group_row; i++) - { - for ( j = 0; j < group_col; j++) - { - uint8_t *in_ptr = in_data + n * fms_bytes + (i * group_col + j) * len_bytes; - uint8_t *out_ptr = out_data + n * fms_bytes + (j * group_row + i) * len_bytes; - - memcpy(out_ptr, in_ptr, len_bytes); - } - } - } - - /* Copy data to output tensor */ - status = vsi_nn_vxCopyDataToTensor(context, output, &output_attr, out_data); - TEST_CHECK_STATUS(status, final); -final: - if (in_data) free(in_data); - if (out_data) free(out_data); - return status; -} -vsi_status VX_CALLBACK vxShuffleChannelKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 4) - { - vx_context context = NULL; - // tensor - vx_tensor imgObj[2] = { NULL }; - // scalar - vx_scalar scalar[2] = { NULL }; - int32_t group_number = 0; - int32_t axis = 0; - - imgObj[0] = (vx_tensor)paramObj[0]; - imgObj[1] = (vx_tensor)paramObj[1]; - scalar[0] = (vx_scalar)paramObj[2]; - scalar[1] = (vx_scalar)paramObj[3]; - - context = vxGetContext((vx_reference)node); - TEST_CHECK_PTR(context,final); - // scalar - status = vxCopyScalar(scalar[0], &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - TEST_CHECK_STATUS(status, final); - status = vxCopyScalar(scalar[1], &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - TEST_CHECK_STATUS(status, final); - - // Call C Prototype - status = vxShuffleChannelFunc(context, imgObj[0], imgObj[1], group_number, axis); - TEST_CHECK_STATUS(status, final); - } -final: - 
return status; -} -vsi_status VX_CALLBACK vxShuffleChannelInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_tensor input = (vx_tensor)paramObj[0]; - vx_scalar group_numbers = (vx_scalar)paramObj[2]; - vx_scalar axis_s = (vx_scalar)paramObj[3]; - uint32_t input_size[4] = {1, 1, 1, 1}; - vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16; - int32_t group_number = 0; - int32_t axis = 0; - int32_t group_column = 0; - float rgroup_column = 0.0f; - uint32_t chs = 0; - vx_uint32 i = 0; - vsi_nn_tensor_attr_t attr; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - for (i = 0; i < attr.dim_num; i++) - { - input_size[i] = attr.size[i]; - } - inputDataFormat = attr.dtype.vx_type; - - status |= vxCopyScalar(group_numbers, &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(axis_s, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if(VX_SUCCESS != status) - { - VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); - return status; - } - chs = input_size[axis]; - if (chs % group_number) - { - VSILOGE("input channel can't be exact divided by group number! at line %d\n", __LINE__); - return VX_FAILURE; - } - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkOffset[2] = 0; - if (axis == 2) - { - if (inputDataFormat == VSI_NN_TYPE_FLOAT16 || inputDataFormat == VSI_NN_TYPE_INT16) - shaderParam.globalWorkScale[0] = 8; - else - shaderParam.globalWorkScale[0] = 16; - shaderParam.globalWorkScale[1] = 4; - shaderParam.globalWorkScale[2] = 1; - - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - shaderParam.globalWorkSize[2] = input_size[2]; - } - else if (axis == 1) - { - shaderParam.globalWorkScale[0] = 32; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = 1; - - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = input_size[1]; - shaderParam.globalWorkSize[2] = input_size[2]; - } - else - { - VSILOGE("[%s : %d]Initializer failure, not support axis: %d! \n",__FILE__, __LINE__, axis); - return VX_FAILURE; - } - group_column = chs / group_number; - rgroup_column = 1.0f / group_column; - - status |= vxSetNodeUniform(nodObj, "group_column", 1, &group_column); - status |= vxSetNodeUniform(nodObj, "rgroup_column", 1, &rgroup_column); - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); - } - return status; -} -static vx_param_description_t vxShuffleChannelKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxShuffleChannelKernelInfo = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL, - NULL, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxShuffleChannelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t vxShuffleChannelKernelInfo8Bits = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL8BITS, - NULL, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxShuffleChannelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t vxShuffleChannelKernelInfo_CPU = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL, - vxShuffleChannelKernel, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t vxShuffleChannelKernelInfo_16BitsAxis1 = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL16BITS_AXIS1, - NULL, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxShuffleChannelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t vxShuffleChannelKernelInfo_8BitsAxis1 = -{ - VX_KERNEL_ENUM_SHUFFLECHANNEL, - VX_KERNEL_NAME_SHUFFLECHANNEL8BITS_AXIS1, - NULL, - vxShuffleChannelKernelParam, - (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxShuffleChannelInitializer, - vsi_nn_KernelDeinitializer -}; -vx_kernel_description_t * vx_kernel_SHUFFLECHANNEL_list[] = -{ - &vxShuffleChannelKernelInfo_CPU, - &vxShuffleChannelKernelInfo, - &vxShuffleChannelKernelInfo8Bits, - &vxShuffleChannelKernelInfo_16BitsAxis1, - &vxShuffleChannelKernelInfo_8BitsAxis1, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c deleted file mode 100644 index 67308f8..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c +++ /dev/null @@ -1,293 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_SPACE2DEPTH) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SPACE2DEPTH) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_space2depth") -#define _VX_KERNEL_FUNC_KERNEL (vxSpace2DepthKernel) - -static vsi_status VX_CALLBACK vxSpace2DepthKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - /* TODO: */ -#define ARG_NUM (2) -#define TENSOR_NUM_INPUT (1) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VX_SUCCESS; - uint32_t i = 0; - vx_context context = NULL; - vsi_nn_tensor_attr_t attr[TENSOR_NUM]; - uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; - vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; - uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; - vx_tensor tensor[TENSOR_NUM] = {NULL}; - - int32_t block_size_x = 0, block_size_y = 0; - int32_t output_depth = 0, output_height = 0, output_width = 0; - int32_t input_batch = 0, input_depth = 0, input_height = 0, input_width = 0; - int32_t batch = 0, dim = 0; - - for(i = 0; i < TENSOR_NUM; i++) - { - memset(&attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - - //prepare data - context = vxGetContext((vx_reference)node); - - for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); - } - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(block_size_x), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(block_size_y), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - dim = attr[0].dim_num; - if(dim < 4) - attr[0].size[3] = 1; - //op calc - //output_batch = attr[1].size[3]; - output_depth = attr[1].size[2]; - output_height = attr[1].size[1]; - output_width = attr[1].size[0]; - - input_batch = attr[0].size[3]; - input_depth = attr[0].size[2]; - input_height = attr[0].size[1]; - input_width = attr[0].size[0]; - - for (batch = 0; batch < input_batch; ++batch) - { - vx_uint32 output_batch_index = batch * output_height * output_width * output_depth; - vx_uint32 input_batch_index = batch * input_height * input_width * input_depth; - vx_uint32 in_d; - for (in_d = 0; in_d < (vx_uint32)input_depth; in_d ++) - { - vx_uint32 in_h; - for (in_h = 0; in_h < (vx_uint32)input_height; ++ in_h) - { - vx_uint32 in_w; - for (in_w = 0; in_w < 
(vx_uint32)input_width; in_w ++) - { - vx_int32 out_w = in_w / block_size_x; - vx_int32 out_h = in_h / block_size_y; - //vx_int32 out_d = (in_w % block_size_x) * input_depth + (in_h % block_size_y) * block_size_x * input_depth + in_d; - vx_int32 out_d = (in_w % block_size_x) + (in_h % block_size_y) * block_size_x + in_d * block_size_x * block_size_y; - - vx_int32 in_index = in_w + in_h * input_width +in_d * input_height * input_width + input_batch_index; - vx_int32 out_index = out_w + out_h * output_width + out_d * output_width * output_height + output_batch_index; - - //outputBase[out_index] = inputBase[in_index]; - float fval; - vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], - &fval, &attr[0].dtype); - vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], - &attr[1].dtype); - } - } - } - } - - //save data - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); - } - for( i = 0; i < TENSOR_NUM; i ++ ) - { - if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); - if (buffer_ptr[i]) free(buffer_ptr[i]); - } - - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -vsi_status VX_CALLBACK vxSpace2DepthInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vsi_status status = VX_SUCCESS; - - vx_tensor input = (vx_tensor)paramObj[0]; - uint32_t input_size[4] = {1, 1, 1, 1}; - vx_uint32 input_dimz = 0; - vx_uint32 input_depth = 0; - vx_uint32 i = 0; - vsi_nn_tensor_attr_t attr; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - for (i = 0; i < attr.dim_num; i++) - { - input_size[i] = attr.size[i]; - } - - input_depth = input_size[2]; - if(input_size[3] > 0) - input_dimz = input_depth * input_size[3]; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkOffset[2] = 0; - shaderParam.globalWorkScale[0] = 8; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = 1; - shaderParam.localWorkSize[0] = 8; - shaderParam.localWorkSize[1] = 1; - shaderParam.localWorkSize[2] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - shaderParam.globalWorkSize[2] = input_dimz; - - { - vx_uint32 uniExtractEvenFp16Stride2_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }; - vx_uint32 uniExtractOddFp16Stride2_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00030001, 0x00070005, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }; - status |= vxSetNodeUniform(nodObj, "uniExtractEvenFp16Stride2_4x4", 1, uniExtractEvenFp16Stride2_4x4); - status |= vxSetNodeUniform(nodObj, "uniExtractOddFp16Stride2_4x4", 1, uniExtractOddFp16Stride2_4x4); - //status |= vxSetNodeUniform(nodObj, "input_depth", 1, &input_depth); - } - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxSpace2DepthKernelInfo_int16_int16 = -{ - _VX_KERNEL_ID, - VX_KERNEL_NAME_SPACE2DEPTH_INT16_INT16, - NULL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxSpace2DepthInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_SPACE2DEPTH_list[] = -{ - NULL, - &_VX_KERNEL_VAR, - &vxSpace2DepthKernelInfo_int16_int16, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx b/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx index f19c623..5130391 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx @@ -54,3 +54,81 @@ __kernel void a_times_b_plus_c_F16_F16_F16toF16_2D VXC_WriteImage(output, coord, 
result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniA_Times_B_lo_4x4; +_viv_uniform VXC_512Bits uniA_Times_B_hi_4x4; +__kernel void a_times_b_plus_c_F16_F16_F32toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_half8 src0, src1, dst; + vxc_ushort8 vec0, vec1, result; + float4 b0, b1; + float4 dst0, dst1; + + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + b0 = read_imagef(input2, coord); + coord.x += 4; + b1 = read_imagef(input2, coord); + coord.x -= 4; + + VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4); + VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4); + dst0 += b0; + dst1 += b1; + + half4 t0, t1; + _viv_asm(CONV, t0, dst0); + _viv_asm(CONV, t1, dst1); + VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + _viv_asm(COPY, result, dst, 16); + + VXC_WriteImage2DArray(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void a_times_b_plus_c_F16_F16_F32toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_t input2, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_half8 src0, src1, dst; + vxc_ushort8 vec0, vec1, result; + float4 b0, b1; + float4 dst0, dst1; + + VXC_ReadImage(vec0, input0, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, input1, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + b0 = read_imagef(input2, coord.xy); + coord.z = coord.x + 4; + b1 = read_imagef(input2, coord.zy); + + VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4); + VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4); + dst0 += b0; + dst1 += b1; + + half4 t0, t1; + _viv_asm(CONV, t0, dst0); + _viv_asm(CONV, t1, dst1); + VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + _viv_asm(COPY, result, dst, 16); + + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx index e36dfdb..90b5135 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx @@ -1,10 +1,11 @@ #include "cl_viv_vx_ext.h" _viv_uniform int indices_num; +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8; __kernel void gather_I8toI8( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -16,7 +17,7 @@ __kernel void gather_I8toI8( int gidz = get_global_id(2); // block_num int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * 
axis_num + indice.x; vxc_char16 src; @@ -28,7 +29,7 @@ __kernel void gather_I8toI8( __kernel void gather_U8toU8( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -40,7 +41,7 @@ __kernel void gather_U8toU8( int gidz = get_global_id(2); // block_num int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * axis_num + indice.x; vxc_uchar16 src; @@ -52,7 +53,7 @@ __kernel void gather_U8toU8( __kernel void gather_I16toI16( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -66,7 +67,7 @@ __kernel void gather_I16toI16( int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -78,7 +79,7 @@ __kernel void gather_I16toI16( __kernel void gather_F16toF16( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -92,7 +93,7 @@ __kernel void gather_F16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -101,3 +102,107 @@ __kernel void gather_F16toF16( int2 coord = (int2)(gidx, gidz * indices_num + gidy); VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_I8toI8_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_char16 src, dst; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_U8toU8_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_uchar16 src, dst; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + 
uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_I16toI16_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_short8 src, dst; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_F16toF16_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_short8 src, dst; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx index e3950b1..e9b8fd1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx @@ -11,7 +11,7 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp #define GATHER_8BITS_TO_F16(src0_type_name, read_type) \ __kernel void gather_##src0_type_name##toF16( \ __read_only image2d_t input0, \ - __read_only image2d_array_t input1, \ + __read_only image2d_t input1, \ __write_only image2d_t output, \ int block_size, \ int block_num, \ @@ -23,7 +23,7 @@ __kernel void gather_##src0_type_name##toF16( \ int gidz = get_global_id(2); \ \ int4 coord_in = (int4)(gidy, 0, gidx, 0); \ - int4 indice = read_imagei(input1, coord_in.xyyy); \ + int4 indice = read_imagei(input1, coord_in.xy); \ coord_in.w = gidz * axis_num + indice.x; \ \ read_type src; \ @@ -47,7 +47,7 @@ GATHER_8BITS_TO_F16(I8, vxc_char16) #define GATHER_F16_TO_QINT(src1_type_name, write_type) \ __kernel void gather_F16to##src1_type_name( \ __read_only image2d_t input0, \ - __read_only image2d_array_t input1, \ + __read_only image2d_t input1, \ __write_only image2d_t output, \ int block_size, \ int block_num, \ @@ -59,7 +59,7 @@ __kernel void gather_F16to##src1_type_name( \ int gidz = get_global_id(2); \ int4 coord_in = (int4)(gidy, 0, gidx, 0); \ \ - int4 indice = read_imagei(input1, coord_in.xyyy); \ + int4 indice = 
read_imagei(input1, coord_in.xy); \ coord_in.w = gidz * axis_num + indice.x; \ \ vxc_short8 src; \ @@ -79,7 +79,7 @@ GATHER_F16_TO_QINT(I16, vxc_short8) __kernel void gather_I16toF16( __read_only image2d_t input0, - __read_only image2d_array_t input1, + __read_only image2d_t input1, __write_only image2d_t output, int block_size, int block_num, @@ -91,7 +91,7 @@ __kernel void gather_I16toF16( int gidz = get_global_id(2); int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xyyy); + int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -109,3 +109,97 @@ __kernel void gather_I16toF16( VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } + +#define GATHER_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \ +__kernel void gather_##src0_type_name##toF16_axis0( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int4 indices = read_imagei(input1, coord.xx); \ + int2 coord_in = (int2)(indices.x, get_global_id(1)); \ + \ + read_type src; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + indices.x = get_global_id(1); \ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_8BITS_TO_F16_AXIS0(U8, vxc_uchar16) +GATHER_8BITS_TO_F16_AXIS0(I8, vxc_char16) + +#define GATHER_F16_TO_QINT_AXIS0(src1_type_name, write_type) \ +__kernel void gather_F16to##src1_type_name##_axis0( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int4 indices = read_imagei(input1, coord.xx); \ + int2 coord_in = (int2)(indices.x, get_global_id(1)); \ + \ + vxc_short8 src; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + indices.x = get_global_id(1); \ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_F16_TO_QINT_AXIS0(U8, vxc_uchar16) +GATHER_F16_TO_QINT_AXIS0(I8, vxc_char16) +GATHER_F16_TO_QINT_AXIS0(I16, vxc_short8) + +__kernel void gather_I16toF16_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + 
int block_num, + int axis_num + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 indices = read_imagei(input1, coord.xx); + int2 coord_in = (int2)(indices.x, get_global_id(1)); + + vxc_short8 src; + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + indices.x = get_global_id(1); + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + vxc_half8 src0; + vxc_short8 dst0; + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + _viv_asm(COPY, dst0, src0, 16); + + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx new file mode 100644 index 0000000..7a796a2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx @@ -0,0 +1,279 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; + +__kernel void layer_norm_F16toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + vxc_short8 src0, src1; + vxc_float sum = 0, sqr = 0; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f; + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord.xw); + vxc_half8 in_h, scale_h; + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, scale_h, src1, 16); + vxc_float4 in_f, scale_f; + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + 
UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + half4 norm_h; + _viv_asm(CONV, norm_h, norm); + vxc_half8 dst; + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniExtractHalf4_dp4x4); + vxc_short8 dstval; + _viv_asm(COPY, dstval, dst, 16); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} +/*****************************layernorm uint8 to uint8****************************/ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; +_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +__kernel void layer_norm_U8toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + vxc_uchar16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float sum = 0, sqr = 0; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + short zp = inputZP; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + int2 coord_bias = (int2)(0, 0); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + + 
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + bias_f0 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + coord_bias.x += 4; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + bias_f1; + bias_f1 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + + tmpData2 -= mean; + norm = scale_f0 * vari * tmpData2 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + tmpData3 -= mean; + norm = scale_f1 * vari * tmpData3 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + } +} +/***************************layernorm float16 to uint8**************************/ +__kernel void layer_norm_F16toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + vxc_short8 src0, src1; + vxc_float sum = 0, sqr = 0; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f; + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord.xw); + vxc_half8 in_h, scale_h; + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, scale_h, src1, 16); + vxc_float4 in_f, scale_f; + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + norm = norm * outputScale + output_zp; + int4 output_int4; + output_int4 = convert_int4_rte(norm); + vxc_uchar8 dst; + VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), + uniConvertInt32toUint8_2x8); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx similarity index 87% rename from src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize.vx rename to src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx index db424ad..d517d7d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx @@ -7,12 +7,9 @@ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; _viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; -__kernel void vxcLayerNorm( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t output, - float eps) +__kernel void layer_norm_F16toF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) { int4 coord = (int4)(0, get_global_id(1), 0, 0); vxc_short8 src0, src1; @@ -44,7 +41,7 @@ __kernel void vxcLayerNorm( VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xwww); + bias_f = read_imagef(bias, coord.xw); vxc_half8 in_h, scale_h; _viv_asm(COPY, in_h, src0, 16); _viv_asm(COPY, scale_h, src1, 16); @@ -76,7 +73,7 @@ _viv_uniform VXC_512Bits uniSqrSum_16x1; _viv_uniform float input_scale; _viv_uniform int inputZP; _viv_uniform float outputScale; -_viv_uniform int output_ZP; +_viv_uniform float output_zp; _viv_uniform int sumInZp; _viv_uniform int tmpZp1; _viv_uniform int tmpZp2; @@ -84,12 +81,9 @@ _viv_uniform float e2InScale; _viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -__kernel void vxcLayerNorm_u8( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t output, - float eps) +__kernel void layer_norm_U8toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) { int4 coord = (int4)(0, get_global_id(1), 0, 0); vxc_uchar16 src0, src2; @@ -121,15 +115,15 @@ __kernel void vxcLayerNorm_u8( vari = rsqrt(vari); vxc_int4 tmpVal0, tmpVal1; vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int4 coord_bias = (int4)(0, 0, 0, 0); + int2 coord_bias = (int2)(0, 0); for(coord.x = 0; coord.x < width; coord.x += 16) { - coord_bias.x = coord.x; VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); @@ -151,49 +145,41 @@ __kernel void vxcLayerNorm_u8( uniConvert3rdUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; + tmpData0 = tmpData0 * input_scale - mean; + tmpData1 = tmpData1 * input_scale - mean; + tmpData2 = tmpData2 * input_scale - mean; + tmpData3 = tmpData3 * input_scale - mean; vxc_float4 norm; - tmpData0 -= mean; norm = scale_f0 * vari * tmpData0 + bias_f0; bias_f0 = read_imagef(bias, coord_bias); VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); coord_bias.x += 4; - tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP); + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - tmpData1 -= mean; norm = scale_f1 * vari * tmpData1 + bias_f1; bias_f1 = read_imagef(bias, coord_bias); VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertSecFp16Fp32_4x4); - tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP); + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); - tmpData2 -= mean; norm = scale_f0 * vari * tmpData2 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP); + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - tmpData3 -= mean; norm = scale_f1 * vari * tmpData3 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP); + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } } /***************************layernorm float16 to uint8**************************/ -_viv_uniform float outputZP; -__kernel void vxcLayerNormFP16toU8( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t output, - float eps) +__kernel void layer_norm_F16toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) { int4 coord = (int4)(0, get_global_id(1), 0, 0); vxc_short8 src0, src1; @@ -225,7 +211,7 @@ __kernel void vxcLayerNormFP16toU8( VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xwww); + bias_f = read_imagef(bias, coord.xw); vxc_half8 in_h, scale_h; _viv_asm(COPY, in_h, src0, 16); _viv_asm(COPY, scale_h, src1, 16); @@ -237,7 +223,7 @@ __kernel void vxcLayerNormFP16toU8( vxc_float4 sub, norm; sub = in_f - mean; norm = scale_f * vari * sub + bias_f; - norm = norm * outputScale + outputZP; + norm = norm * outputScale + output_zp; int4 output_int4; output_int4 = convert_int4_rte(norm); vxc_uchar8 dst; @@ -245,4 +231,4 @@ __kernel void vxcLayerNormFP16toU8( uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); } -} \ No newline at end of file +} diff 
--git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx new file mode 100644 index 0000000..bedc979 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx @@ -0,0 +1,167 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform float dimRatio_scale; +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform float e2InScale; +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform float input_scale; +_viv_uniform int inputZP; + +__kernel void layer_norm_I16toI16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + vxc_short8 src0, src1, dst; + vxc_float sum = 0, sqr = 0; + for(; coord.x < width;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sum += sumsqr.x; + sqr = sqr + sumsqr.y * e2InScale; + } + vxc_float mean; + mean = sum * dimRatio_scale; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_half8 scale_h; + vxc_int4 tmpVal0, tmpVal1; + + int2 coord_bias = (int2)(0, 0); + + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; + VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 * input_scale - mean; + norm = scale_f0 * vari * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 * input_scale - mean; + norm = scale_f1 * vari * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + 
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_I16toI16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int2 coord = (int2)(0, get_global_id(1)); + + vxc_short8 src0, src1, dst; + vxc_float sum = 0, sqr = 0; + for(; coord.x < width;) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sum += sumsqr.x; + sqr = sqr + sumsqr.y * e2InScale; + } + vxc_float mean, vari; + mean = sum * dimRatio_scale; + vari = sqr * dimRatio - mean * mean; + vari += eps; + vari = rsqrt(vari); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_half8 scale_h; + vxc_int4 tmpVal0, tmpVal1; + + int2 coord_bias = (int2)(0, 0); + + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; + VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 * input_scale - mean; + norm = scale_f0 * vari * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 * input_scale - mean; + norm = scale_f1 * vari * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx new file mode 100644 index 0000000..d7d7066 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx @@ -0,0 +1,252 @@ +#include "cl_viv_vx_ext.h" + +/*****************************layernorm uint8 to fp16****************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; 
+_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits UniPackFP16even_2x8; + +__kernel void layer_norm_U8toF16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_array_t output, + float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + int2 coord_bias = (int2)(0, 0); + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_short8 src1, outval; + short zp = inputZP; + half4 tmpVal0, tmpVal1; + vxc_half8 dst; + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.x = coord.x; + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + bias_f0 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + coord_bias.x += 4; + _viv_asm(CONV, tmpVal0, norm); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + 
bias_f1; + bias_f1 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + tmpData2 -= mean; + norm = scale_f0 * vari * tmpData2 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + + tmpData3 -= mean; + norm = scale_f1 * vari * tmpData3 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_U8toF16_2D( + image2d_t input, + image2d_t bias, + image2d_t scale, + image2d_t output, + float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + int2 coord_bias = (int2)(0, 0); + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_short8 src1, outval; + short zp = inputZP; + half4 tmpVal0, tmpVal1; + vxc_half8 dst; + + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + coord_bias.x = coord.x; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= 
input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + bias_f0 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + coord_bias.x += 4; + _viv_asm(CONV, tmpVal0, norm); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + bias_f1; + bias_f1 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + coord_out.x = coord.x; + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + tmpData2 -= mean; + norm = scale_f0 * vari * tmpData2 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + + tmpData3 -= mean; + norm = scale_f1 * vari * tmpData3 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + coord_out.x += 8; + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx new file mode 100644 index 0000000..03802e8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx @@ -0,0 +1,426 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform int width; + +_viv_uniform int height; + +_viv_uniform int height_depth; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float outputScale; +_viv_uniform float output_zp; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + sum 
+= dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + vxc_half8 dst; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; + coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, in_h, src0, 16); 
+ VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 - mean_vari.s0; + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + sub = tmpData1 - mean_vari.s0; + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + vxc_half8 dst; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 - mean_vari.s0; + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + sub = tmpData1 - mean_vari.s0; + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 
0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; + coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 - mean_vari.s0; + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 - mean_vari.s0; + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; 
coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 - mean_vari.s0; + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 - mean_vari.s0; + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx new file mode 100644 index 0000000..61e4e29 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx @@ -0,0 +1,266 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform float e2InScale; +_viv_uniform int width; + +_viv_uniform float input_scale; +_viv_uniform int height; + +_viv_uniform int height_depth; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float outputScale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform int inputZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + float4 tmpSumSqr = (float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + tmpSumSqr.x *= input_scale; + tmpSumSqr.y *= e2InScale; + } + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + 
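/* Editor's note, not part of the patch: the sumSqr kernels are the first pass of a
 * two-pass layer norm over the W*H plane. Each of the 16 work-items accumulates a
 * partial sum and sum-of-squares for its strip of columns, parks the pair in local
 * memory, and after the barrier below work-item 0 folds the 16 partials into one
 * (sum, sumSqr) value per work-group. A plain-C sketch of the same accumulation,
 * with hypothetical names (e2InScale is presumably the squared input scale):
 *
 *     float sum = 0.0f, sqr = 0.0f;            // this work-item's strip
 *     for (int y = 0; y < height; ++y)
 *         for (int k = 0; k < 8; ++k) {        // 8 int16 lanes per row load
 *             float v = (float)strip[y][k];
 *             sum += v;
 *             sqr += v * v;
 *         }
 *     lcl_sum[lidx] = sum * input_scale;
 *     lcl_sqr[lidx] = sqr * e2InScale;
 *
 * The apply kernels later turn the group totals into mean = sum * dimRatio and
 * var = sumSqr * dimRatio - mean * mean. */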
barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + float4 data = (float4)(0); + for(int i = 0; i < 4; i++) + { + data.x += dot(tmp_sum[i], one); + data.y += dot(tmp_sqr[i], one); + } + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D( + image2d_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_short8 src0; + float4 tmpSumSqr = (float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + tmpSumSqr.x *= input_scale; + tmpSumSqr.y *= e2InScale; + } + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + float4 data = (float4)(0); + for(int i = 0; i < 4; i++) + { + data.x += dot(tmp_sum[i], one); + data.y += dot(tmp_sqr[i], one); + } + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_short8 src0, src1, outval; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; + coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_short8 src0, src1, outval; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff 
--git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx new file mode 100644 index 0000000..521a8cf --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx @@ -0,0 +1,419 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform int width; + +_viv_uniform float input_scale; +_viv_uniform int height; + +_viv_uniform int height_depth; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float outputScale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform int inputZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); + } + sqr += (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum + sumInZp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D( + image2d_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), 
uniSumU8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); + } + sqr += (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum + sumInZp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + 
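/* Editor's note, not part of the patch: in scalar C terms the loop body above
 * computes, for each quantized input x_q and channel c,
 *
 *     x = (x_q - inputZP) * input_scale;                  // dequantize
 *     y = scale[c] * (x - mean) * inv_std + bias[c];      // affine layer norm
 *
 * where mean = mean_vari.s0 and inv_std = mean_vari.s1 = rsqrt(var + eps).
 * The *toF16 variants convert y straight to half below; the *toU8 / *toI16
 * variants requantize with convert_int4_rte(y * outputScale + output_zp). */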
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + coord_para.z = (ushort)gidz / (ushort)(height_depth); + vxc_uchar16 src0 , outval; + vxc_short8 src1; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + 
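/* Editor's note, not part of the patch: this descriptor idiom, repeated for the
 * scale and output images, copies the image handle into an int8 register and
 * forms base + z * stride from fields .s0 and .s4 (which read as the slice base
 * address and the per-slice pitch). The result is parked in coord.w, coord_para.w
 * and coord.z so the img_load_3d and img_store_3d ops in the loop can address one
 * W*H slice with plain 2D coordinates. */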
_viv_asm(MOV, coord.w, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.y = coord.y; + coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_t output, float eps) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_bias = (int2)(0, 0); + vxc_uchar16 src0, outval; + vxc_short8 src1; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_bias); + coord_bias.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord_bias = coord; + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_bias.y = coord.y; + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + VXC_DP4x4(tmpData0, src0, zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean_vari.s0; + tmpData1 = tmpData1 * input_scale - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx deleted file mode 100644 index 0ce3d53..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx +++ /dev/null @@ -1,136 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniDescaleU8_4x4; -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4; - -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniExtractInt32BgraToU8Bgr_2x8; - -_viv_uniform int zp; -_viv_uniform float outputScale; - -__kernel void pre_process_bgra_scale_nhwc_U8toU8( - __read_only image2d_array_t input, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx = (sx + (*xOffset)) * 4 ; - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y); - vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3; - vxc_uchar16 dataB, dataG, dataR; - - VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos.w = sx.w; - - VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); - - vxc_uchar4 val_u8; - int4 tmp1, tmp2, result1, result2; - float4 tmpDst, tmp0; - float4 mean = (float4)(bMean, gMean, rMean, 0); - //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x); - int tmpV = 1 << 19; - vxc_short8 tmpFx; - VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), - uniConvertInt32toUint8_2x8); - //tmpFx = fx.xxxx; - VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), - uniBilinearTmp1BgraShort_4x4); - VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), - uniBilinearTmp2BgraShort_4x4); - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ - uniConvertIntergetoF32_4x4); - tmpDst = (tmp0 - mean) * var; - result1 = convert_int4_rte(tmpDst * outputScale + zp); - - //tmpFx = fx.yyyy; - VXC_DP4x4(tmp1, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4); - VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4); - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ - uniConvertIntergetoF32_4x4); - tmpDst = (tmp0 - mean) * var; - result2 = convert_int4_rte(tmpDst * outputScale + zp); - - vxc_uchar16 dst; - VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), - uniExtractInt32BgraToU8Bgr_2x8); - - //tmpFx = fx.zzzz; - VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4); - VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4); - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ - uniConvertIntergetoF32_4x4); - tmpDst = (tmp0 - mean) * var; - result1 = convert_int4_rte(tmpDst * outputScale + zp); - - //tmpFx = fx.wwww; - VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4); - VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4); - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ - uniConvertIntergetoF32_4x4); - tmpDst = (tmp0 - mean) * var; - result2 = convert_int4_rte(tmpDst * outputScale + zp); - - VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(6, 11, 0, VXC_RM_ToNearestEven, 1), - uniExtractInt32BgraToU8Bgr_2x8); - - int4 dstPos = (int4)(get_global_id(0) * 3, gidy, 0, 0); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx deleted file mode 100644 index e235c7f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx +++ /dev/null @@ -1,89 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int bOrder; -_viv_uniform int rOrder; - -_viv_uniform float outputScaleVar; -_viv_uniform float bMeanScaleVarZp; -_viv_uniform float gMeanScaleVarZp; -_viv_uniform float rMeanScaleVarZp; - -_viv_uniform uint xrIntFloat_16; -_viv_uniform uint yrIntFloat_16; - -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; - -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; - -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8; -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8; - -__kernel void pre_process_nv12_trans_U8toU8( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result, dstR, dstG, dstB; - vxc_uchar16 dst, tmpPack; - dstB = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstG = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstR = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - - if(bOrder == 2) - { - int4 exchangeData = dstB; - dstB = dstR; - dstR = exchangeData; - } - - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); - - int2 dstPos = (int2)(get_global_id(0) * 3, gidy); - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx deleted file mode 100644 index da337ab..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx +++ /dev/null @@ -1,94 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform float outputScale; -_viv_uniform float outputZP; -_viv_uniform VXC_512Bits uniNormilizationLo_2x8; -_viv_uniform VXC_512Bits uniNormilizationHi_2x8; -#define IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(dst_name, dst_type, copy_type) \ -__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \ - ( \ - __read_only image2d_array_t input, \ - __write_only image2d_array_t output, \ - global int *xRatio, \ - global int *yRatio, \ - global int *xOffset, \ - global int *yOffset, \ - float rMean, \ - float gMean, \ - float bMean, \ - float f32Var, \ - int reverse_channel, \ - int trans \ - ) \ -{ \ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ - \ - coord.xy += (int2) (*xOffset, *yOffset); \ - vxc_uchar16 src0, src1; \ - dst_type dst0, dst1; \ - copy_type dst; \ - \ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - f32Var *= outputScale; \ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ - bMean * f32Var - outputZP, f32Var); \ - half4 paramData_f16; \ - _viv_asm(CONV, paramData_f16, paramData); \ - \ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); \ - coord_out.z = coord_out.x + 8; \ - \ - VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ - uniNormilizationLo_2x8); \ - VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), \ - uniNormilizationHi_2x8); \ - _viv_asm(COPY, dst, dst0, 16); \ - VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - _viv_asm(COPY, dst, dst1, 16); \ - VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \ -} -IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(I16, vxc_short8, vxc_short8) -IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(F16, vxc_half8, vxc_short8) - -#define IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(dst_name, dst_type) \ -__kernel void 
pre_process_rgb_copy_nhwc_U8to##dst_name \ - ( \ - __read_only image2d_array_t input, \ - __write_only image2d_array_t output, \ - global int *xRatio, \ - global int *yRatio, \ - global int *xOffset, \ - global int *yOffset, \ - float rMean, \ - float gMean, \ - float bMean, \ - float f32Var, \ - int reverse_channel, \ - int trans \ - ) \ -{ \ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ - coord.xy += (int2) (*xOffset, *yOffset); \ - vxc_uchar16 src0, src1; \ - dst_type dst; \ - \ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - f32Var *= outputScale; \ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ - bMean * f32Var - outputZP, f32Var); \ - \ - half4 paramData_f16; \ - _viv_asm(CONV, paramData_f16, paramData); \ - \ - int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ - \ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ - uniNormilizationLo_2x8); \ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), \ - uniNormilizationHi_2x8); \ - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); \ -} -IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(U8, vxc_uchar16) -IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx deleted file mode 100644 index 0820a03..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx +++ /dev/null @@ -1,172 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniVecShift10; -_viv_uniform VXC_512Bits uniAddRShift; -_viv_uniform VXC_512Bits uniGetTempVal; -_viv_uniform VXC_512Bits uniExtractBytes; -_viv_uniform VXC_512Bits uniUnpackToR; -_viv_uniform VXC_512Bits uniUnpackToG; -_viv_uniform VXC_512Bits uniUnpackToB; - -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; -_viv_uniform float outputScale; -_viv_uniform VXC_512Bits uniExtract8Data_2x8; -_viv_uniform float outputZP; - -_viv_uniform VXC_512Bits uniRePackRGBLo_2x8; -_viv_uniform VXC_512Bits uniRePackRGBHi_2x8; -#define IMAGE_PRE_PROCESS_NHWC(dst_name, conv_type, dst_type, copy_type) \ -__kernel void pre_process_rgb_scale_nhwc_U8to##dst_name \ - ( \ -__read_only image2d_array_t input, \ -__write_only image2d_array_t output, \ - global int *xRatio, \ - global int *yRatio, \ - global int *xOffset, \ - global int *yOffset, \ - float rMean, \ - float gMean, \ - float bMean, \ - float f32Var, \ - int reverse_channel, \ - int trans \ - ) \ -{ \ - int2 ratioXY = (int2)(*xRatio, *yRatio); \ - int4 xPos = get_global_id(0); \ - int yPos = get_global_id(1); \ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ - xPos += (int4)(0, 1, 2, 3); \ - \ - /*x*/ \ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ - int4 sx = fx0 & 0xffff8000; \ - fx0 -= sx; \ - sx = sx >> 15; \ - \ - vxc_short4 fx; \ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ - /*y*/ \ - int fy = yPos * ratioXY.y + ratioSufXY.y; \ - int sy = fy & 0xffff8000; \ - \ - fy -= sy; \ - sy = sy >> 15; \ - \ - fy = (fy + (1<< 4)) >> 5; \ - \ - vxc_uchar16 line0RGB1, line0RGB2; \ - vxc_uchar16 line1RGB3, line1RGB4; \ - int4 coord; \ - sx = sx * 3 + *xOffset; \ - coord.xyz = sx.xyz; \ - coord.w = sy + *yOffset; \ - int2 coord1 = (int2)(sx.w, coord.w); \ - VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - 
VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ - \ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \ - \ - bgrMean *= f32Var; \ - \ - int4 test01, temp1; \ - int4 test02, temp2; \ - int4 tt; \ - vxc_uchar4 val; \ - int4 coord_out = (int4)(xPos.x * 3, yPos, xPos.x * 3 + 6, 0); \ - \ - vxc_uchar8 line1, line2; \ - \ - /*R*/ \ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \ - \ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp1 = temp1 + test01; \ - \ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp2 = temp2 + test02; \ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ - \ - vxc_float4 tmp_dst; \ - vxc_uchar4 u8_dst; \ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ - uniConvertIntergetoF32_4x4); \ - \ - /*convert U8 to dst*/ \ - dst_type dstRG, dstB, dst; \ - tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \ - tmp_dst = tmp_dst * outputScale + outputZP; \ - conv_type dst0; \ - _viv_asm(CONV_RTE, dst0, tmp_dst); \ - VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - \ - /*G*/ \ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \ - \ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp1 = temp1 + test01; \ - \ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp2 = temp2 + test02; \ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ - \ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ - uniConvertIntergetoF32_4x4); \ - \ - tmp_dst = tmp_dst * f32Var - bgrMean.y; \ - tmp_dst = tmp_dst * outputScale + outputZP; \ - _viv_asm(CONV_RTE, dst0, tmp_dst); \ - VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - \ - /*B*/ \ - VXC_DP2x8(line1, 
line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \ - \ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp1 = temp1 + test01; \ - \ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ - temp2 = temp2 + test02; \ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ - \ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ - uniConvertIntergetoF32_4x4); \ - \ - tmp_dst = tmp_dst * f32Var - bgrMean.x; \ - tmp_dst = tmp_dst * outputScale + outputZP; \ - _viv_asm(CONV_RTE, dst0, tmp_dst); \ - VXC_DP2x8(dstB, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBLo_2x8); \ - copy_type result; \ - _viv_asm(COPY, result, dst, 16); \ - VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBHi_2x8); \ - _viv_asm(COPY, result, dst, 16); \ - VXC_WriteImage(output, coord_out.zy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ -} -IMAGE_PRE_PROCESS_NHWC(U8, uint4, vxc_uchar16, vxc_uchar16) -IMAGE_PRE_PROCESS_NHWC(I8, int4, vxc_char16, vxc_char16) -IMAGE_PRE_PROCESS_NHWC(I16, int4, vxc_short8, vxc_short8) -IMAGE_PRE_PROCESS_NHWC(F16, half4, vxc_half8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx index 4600537..951ee96 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx @@ -23,19 +23,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; _viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; _viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniPackBG0_2x8; -_viv_uniform VXC_512Bits uniPackTmpAndR_2x8; -_viv_uniform VXC_512Bits uniPackRB0_2x8; -_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8; -_viv_uniform VXC_512Bits uniPackGR1_2x8; -_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8; -_viv_uniform VXC_512Bits uniPackBG1_2x8; -_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8; -_viv_uniform VXC_512Bits uniPackRB2_2x8; -_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8; -_viv_uniform VXC_512Bits uniPackGR2_2x8; -_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8; - _viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; _viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; _viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; @@ -145,137 +132,3 @@ __kernel void pre_process_yuv420_copy_U8toU8( VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } -// store bgrbgrbgr -__kernel void pre_process_yuv420_copy_trans_U8( - __read_only image2d_t y_img, - __read_only image2d_t u_img, - __read_only image2d_t v_img, - __write_only image2d_array_t output, - global int * xRatio, - global int * yRatio, - global int * xOffset, - global int * yOffset, - float rMean, - float gMean, - float bMean, - 
float var, - int reverse_channel, - int trans - ) -{ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0); - vxc_uchar16 Y; - vxc_uchar8 U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - vxc_uchar16 dst; - - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - var *= outputScale; - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ - rMean * var - zp, var); - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - //C = Y - 16; - //D = U - 128; - //E = V - 128; - // calculate R - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); - - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - - // calculate G - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP4x4(dst, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(dst, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - VXC_DP4x4(dst, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); - VXC_DP4x4(dst, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); - - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); - - // calculate B - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniCalculateTmpB2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); - tmpV = -70688; - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); - - // reorder to bgr - vxc_uchar8 tmpdst0, tmpdst1; - vxc_uchar16 dst0, dst1, dst2; - - if(bOrder == 2) - { - vxc_uchar16 exchangeData = B; - B = R; - R = exchangeData; - } - - // BGR BGR BG - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8); - VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8); - - // RBG RBG RB - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8); - VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8); - - pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0); - - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.x += 16; - - // GRB GRB GR - VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8); - VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8); - - // BGR BGR BG - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8); - VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8); - - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.x += 16; - - // RBG RBG RB - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8); - VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8); - - // GRB GRB GR - VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8); - VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8); - - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx deleted file mode 100644 index afb6bef..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx +++ /dev/null @@ -1,235 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform 
VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8; -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform int zp; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_trans_U8toU8( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - 
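/* Editor's note, not part of the patch: the reads above and below gather the 2x2
 * bilinear neighbourhood for four output pixels from 4:2:0 planar data. Y is
 * sampled at full resolution, at (sx, sy) and the row below; the U and V planes
 * are half resolution in both axes, hence the chroma coordinates built from
 * sx >> 1, (sx + 1) >> 1, sy >> 1 and (sy + 1) >> 1. */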
VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - 
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2, dstR, dstG, dstB; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - tmpV = 1 << 19; - vxc_uchar8 dst, tmpPack; - float4 tmpDst; - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstB = convert_int4_rte(tmpDst * outputScale + zp); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstG = convert_int4_rte(tmpDst * outputScale + zp); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - rMean) * var; - dstR = convert_int4_rte(tmpDst * outputScale + zp); - - if(bOrder == 2) - { - int4 exchangeData = dstB; - dstB = dstR; - dstR = exchangeData; - } - - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); - - int2 dstPos = (int2)(get_global_id(0) * 3, gidy); - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx index ca99597..20803c9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx @@ -22,19 +22,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; _viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; _viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniPackBG0_2x8; -_viv_uniform VXC_512Bits uniPackTmpAndR_2x8; -_viv_uniform VXC_512Bits uniPackRB0_2x8; -_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8; -_viv_uniform VXC_512Bits uniPackGR1_2x8; -_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8; -_viv_uniform VXC_512Bits uniPackBG1_2x8; -_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8; -_viv_uniform VXC_512Bits uniPackRB2_2x8; -_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8; -_viv_uniform VXC_512Bits uniPackGR2_2x8; -_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8; - _viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; _viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; _viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; @@ -143,137 +130,3 @@ __kernel void pre_process_yuv444_copy_U8toU8( pos.z = rOrder; VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } - -// store bgrbgrbgr -__kernel void pre_process_yuv444_copy_trans_U8( - __read_only image2d_t y_img, - __read_only image2d_t u_img, - __read_only image2d_t v_img, - __write_only image2d_array_t output, - global int * xRatio, - global int * yRatio, - global int * xOffset, - global int * yOffset, - float 
rMean, - float gMean, - float bMean, - float var, - int reverse_channel, - int trans - ) -{ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - vxc_uchar16 dst; - - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - var *= outputScale; - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ - rMean * var - zp, var); - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - //C = Y - 16; - //D = U - 128; - //E = V - 128; - // calculate R - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); - - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - - // calculate G - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG0, tmpDstG1; - VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8); - VXC_DP4x4(dst, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(dst, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - VXC_DP4x4(dst, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(dst, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); - - // calculate B - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); - tmpV = -70688; - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); - - // reorder to bgr - vxc_uchar8 tmpdst0, tmpdst1; - vxc_uchar16 dst0, dst1, dst2; - - if(bOrder == 2) - { - vxc_uchar16 exchangeData = B; - B = R; - R = exchangeData; - } - - // BGR BGR BG - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8); - VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8); - - // RBG RBG RB - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8); - VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8); - - pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0); - - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.x += 16; - - // GRB GRB GR - VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8); - VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8); - - // BGR BGR BG - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8); - VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8); - - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.x += 16; - - // RBG RBG RB - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8); - VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8); - - // GRB GRB GR - VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8); - VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8); - - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx deleted file mode 100644 index 8217d2f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx +++ /dev/null @@ -1,196 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits 
uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8; -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform int zp; -_viv_uniform float outputScale; - -#define IMAGE_PRE_PROCESS_YUV444_TRANS(dst_name, dst_type) \ -__kernel void pre_process_yuv444_trans_U8to##dst_name( \ - __read_only image2d_t y_img, __read_only image2d_t u_img, \ - __read_only image2d_t v_img, __write_only image2d_t output, \ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \ -{ \ - int4 gidx = get_global_id(0); \ - int gidy = get_global_id(1); \ - gidx += (int4)(0, 1, 2, 3); \ - \ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \ - int4 sx = fx & 0xffff8000; \ - int fy, sy; \ - fx -= sx; \ - sx = sx >> 15; \ - fx = (fx +(1 << 4)) >> 5; \ - \ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \ - sy = fy & 0xffff8000; \ - fy -= sy; \ - sy = sy >> 15; \ - \ - sy = sy < 0 ? 0 : sy; \ - fy = fy < 0 ? 
0 : fy; \ - \ - fy = (fy + (1<< 4)) >> 5; \ - sx += (*xOffset); \ - sy += (*yOffset); \ - int2 srcPos = (int2)(sx.x, sy); \ - \ - vxc_uchar16 Y, U, V; \ - vxc_int4 C0, C1, C2, C3; \ - vxc_uchar16 R, G, B; \ - \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - \ - srcPos.x = sx.y; \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - srcPos.x = sx.z; \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ - \ - srcPos.x = sx.w; \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - int tmpV = -56992; \ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ - \ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \ - \ - ushort tmpG = 34784; \ - vxc_ushort8 tmpDstG, tmpDstG1; \ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ - \ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \ - tmpV = -70688; \ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ - \ - int4 result, temp1, temp2, dstR, dstG, dstB; \ - int4 tmpData0, tmpData1; \ - \ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ - temp1 = fx * tmpData0 + tmpData1; \ - \ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ - temp2 = fx * tmpData0 + tmpData1; \ - result = fy * temp2 + (temp1 << 10); \ - \ - tmpV = 1 << 19; \ - dst_type dst, tmpPack; \ - float4 tmpDst; \ - \ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - bMean) * var; \ - dstB = convert_int4_rte(tmpDst * outputScale + zp); \ - \ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ - temp1 = fx * tmpData0 + tmpData1; \ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ - temp2 = fx * tmpData0 + tmpData1; \ - result = fy * temp2 + (temp1 << 10); \ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst 
= (tmpDst - gMean) * var; \ - dstG = convert_int4_rte(tmpDst * outputScale + zp); \ - \ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ - temp1 = fx * tmpData0 + tmpData1; \ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ - temp2 = fx * tmpData0 + tmpData1; \ - result = fy * temp2 + (temp1 << 10); \ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - rMean) * var; \ - dstR = convert_int4_rte(tmpDst * outputScale + zp); \ - \ - if(bOrder == 2) \ - { \ - int4 exchangeData = dstB; \ - dstB = dstR; \ - dstR = exchangeData; \ - } \ - \ - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); \ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); \ - \ - int2 dstPos = (int2)(get_global_id(0) * 3, gidy); \ - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \ -} -IMAGE_PRE_PROCESS_YUV444_TRANS(U8, vxc_uchar16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx index cd56af5..8f7826b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx @@ -28,37 +28,34 @@ __kernel void resize_bilinear_BF16toBF16_DOWN float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - int bottom_y_idx = top_y_idx + 1; vxc_short8 top; vxc_short8 bottom; vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top, input, coord_in, \ - 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 src; float4 left4; @@ -84,7 +81,14 @@ __kernel void resize_bilinear_BF16toBF16_DOWN vxc_ushort8 tmp, dst; _viv_asm(COPY, tmp, dst4, 16); dst.s0123 = tmp.s1357; - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_BF16toBF16_UP @@ -107,22 +111,24 @@ __kernel void resize_bilinear_BF16toBF16_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); vxc_ushort8 src0, src1, src2, src3, dst0, dst1; vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; @@ -132,29 +138,36 @@ __kernel void resize_bilinear_BF16toBF16_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + 
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; vxc_ushort8 dst_tmp; - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); _viv_asm(COPY, left4, dst_tmp, 16); @@ -176,7 +189,30 @@ __kernel void resize_bilinear_BF16toBF16_UP vxc_ushort8 tmp, dst; _viv_asm(COPY, tmp, dst4, 16); dst.s0123 = tmp.s1357; - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.z ++; - } while (coord_in.z < depth); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } + + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 dst_tmp; + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, left4, dst_tmp, 16); + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, right4, dst_tmp, 16); + right4 -= left4; + top4 = right4 * x_lerp + left4; + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, left4, dst_tmp, 16); + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, right4, dst_tmp, 16); + right4 -= left4; + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + vxc_ushort8 tmp, dst; + _viv_asm(COPY, tmp, dst4, 16); + dst.s0123 = tmp.s1357; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx index f910d21..463b5a2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx 
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx @@ -1,7 +1,7 @@ #include "cl_viv_vx_ext.h" _viv_uniform VXC_512Bits uniExtact8Bit_2x8; -_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_left_4x4; _viv_uniform VXC_512Bits uniRightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtactHalf8_2x8; _viv_uniform float2 scale_xy; @@ -27,94 +27,66 @@ __kernel void resize_bilinear_F16toF16_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - vxc_short8 top_left0, top_right0; - vxc_short8 bottom_left0, bottom_right0; - vxc_half8 top_left, top_right; - vxc_half8 bottom_left, bottom_right; + vxc_short8 top_short, bottom_short, dst; + vxc_half8 top, bottom, result; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top_left, top_left0, 16); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top_right, top_right0, 16); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left0, input, 
coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, bottom_left, bottom_left0, 16); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, bottom_right, bottom_right0, 16); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, top_short, 16); + _viv_asm(COPY, bottom, bottom_short, 16); float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); - top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); - bottom4 = right4 * x_lerp + left4; - bottom4 -= top4; - float4 dst4 = bottom4 * y_lerp + top4; + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + half4 tmp; _viv_asm(CONV, tmp, dst4); - VXC_DP2x8(top_left, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); - _viv_asm(COPY, top_left0, top_left, 16); - VXC_WriteImage2DArray(output, coord_out, top_left0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_DP2x8(result, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, result, 16); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void 
resize_bilinear_F16toU8_DOWN @@ -131,84 +103,50 @@ __kernel void resize_bilinear_F16toU8_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - vxc_short8 top_left0, top_right0; - vxc_short8 bottom_left0, bottom_right0; - vxc_half8 top_left, top_right; - vxc_half8 bottom_left, bottom_right; + + vxc_short8 top_short, bottom_short; + vxc_half8 top, bottom; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top_left, top_left0, 16); + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, top_short, 16); + _viv_asm(COPY, bottom, bottom_short, 16); - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top_right, top_right0, 16); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, bottom_left, bottom_left0, 16); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, bottom_right, bottom_right0, 16); float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; @@ -216,7 +154,14 @@ __kernel void resize_bilinear_F16toU8_DOWN int4 dst = convert_int4_rte(dst4); vxc_uchar8 dst_uchar; VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, dst_uchar, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_uchar, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_F16toF16_UP @@ -239,24 +184,26 @@ __kernel void resize_bilinear_F16toF16_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); + vxc_ushort8 src0, src1, src2, src3, dst0, dst1; vxc_half8 top; vxc_half8 bottom; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 
0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; @@ -266,32 +213,41 @@ __kernel void resize_bilinear_F16toF16_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); + + + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); - VXC_DP4x4(right4, 
bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; @@ -299,7 +255,28 @@ __kernel void resize_bilinear_F16toF16_UP _viv_asm(CONV, tmp, dst4); VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); _viv_asm(COPY, dst0, top, 16); - VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.z ++; - } while (coord_in.z < depth); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } + + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + _viv_asm(COPY, bottom, dst1, 16); + + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + half4 tmp; + _viv_asm(CONV, tmp, dst4); + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst0, top, 16); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx index 7f5b21f..bdfa3fb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx @@ -5,8 +5,8 @@ _viv_uniform float2 scale_xy; _viv_uniform int depth; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; -_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4; -_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; _viv_uniform float dfpScale; _viv_uniform float half_pixel_value; @@ -34,8 +34,6 @@ __kernel void resize_bilinear_I16toI16_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); vxc_ushort8 src0, src1, src2, src3, dst0, dst1; @@ -44,16 +42,19 @@ __kernel void resize_bilinear_I16toI16_UP int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); - coord_in.y = 
bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; @@ -63,39 +64,42 @@ __kernel void resize_bilinear_I16toI16_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); - - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; - + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src2, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); 
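/* Editor's note (illustrative sketch, not part of the patch): the pair of VXC_DP4x4
 * calls above replaces the old uniConvertDFP2FP32_4x4 / _part1_4x4 tables. Judging
 * from the removal of the explicit "right4 -= left4;" step elsewhere in this diff,
 * the renamed uniRightSubLeft_4x4 table appears to produce (right - left) directly,
 * so the multiply-add lines that follow are plain bilinear interpolation. A scalar C
 * model of the same arithmetic, with hypothetical names: */
static float bilinear_lerp(float top_left, float top_right,
                           float bottom_left, float bottom_right,
                           float x_lerp, float y_lerp)
{
    float top    = top_left    + x_lerp * (top_right    - top_left);     /* top4    */
    float bottom = bottom_left + x_lerp * (bottom_right - bottom_left);  /* bottom4 */
    return top + y_lerp * (bottom - top);   /* dst4 = bottom4 * y_lerp + top4 */
}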
bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; @@ -103,10 +107,30 @@ __kernel void resize_bilinear_I16toI16_UP int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.z ++; - } while (coord_in.z < depth); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } + + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + _viv_asm(COPY, bottom, dst1, 16); + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * dfpScale; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } __kernel void resize_bilinear_I16toI16_DOWN @@ -125,103 +149,67 @@ __kernel void resize_bilinear_I16toI16_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); - float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; - float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - - vxc_short8 top_left, top_right; - vxc_short8 bottom_left, bottom_right; + vxc_short8 top, bottom, result; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 
0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top_left, top_left, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, top_right, top_right, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, bottom_left, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, bottom_right, bottom_right, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + 
left4; - bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; - int4 dst = convert_int4_rte(dst4); - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx index aebf873..0be6cc5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx @@ -5,8 +5,8 @@ _viv_uniform float2 scale_xy; _viv_uniform int depth; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; -_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4; -_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; _viv_uniform float dfpScale; _viv_uniform float half_pixel_value; @@ -34,8 +34,6 @@ __kernel void resize_bilinear_I8toI8_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); vxc_uchar16 src0, src1, dst0, dst1; @@ -44,12 +42,15 @@ __kernel void resize_bilinear_I8toI8_UP int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; @@ -59,37 +60,42 @@ __kernel void resize_bilinear_I8toI8_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); 
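/* Editor's note (illustrative sketch, not part of the patch): throughout this diff the
 * VXC_ReadImage2DArray/VXC_WriteImage2DArray calls are replaced by img_load_3d /
 * img_store_3d through an int8 descriptor copied out of the image object. Judging
 * from the arithmetic alone, descriptor element s0 appears to hold the base address
 * and s4 the per-slice stride, so "baseAddr = z * s4 + s0" selects slice z and the
 * _UP kernels advance with "coord.w += desc.s4". The old do/while over depth is also
 * peeled into "while (z < depth - 1)" plus an epilogue, so the next slice can be
 * prefetched inside the loop without loading past the last one. A rough C model of
 * that pattern, with hypothetical names: */
typedef struct { int base; int slice_stride; } img_desc_t;  /* ~ desc.s0 / desc.s4 */

static void process_all_slices(const img_desc_t *desc, int depth)
{
    int addr = desc->base;                 /* baseAddr for slice 0              */
    /* load slice 0 here */
    for (int z = 0; z < depth - 1; ++z)
    {
        addr += desc->slice_stride;        /* point at slice z + 1              */
        /* prefetch slice z + 1, then compute and store slice z                 */
    }
    /* epilogue: compute and store the final slice without issuing another load */
}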
+ VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; VXC_DP4x4(left4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; @@ -97,10 +103,31 @@ __kernel void resize_bilinear_I8toI8_UP dst4 = dst4 * dfpScale; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.z ++; - } while (coord_in.z < depth); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } + + VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + _viv_asm(COPY, bottom, dst1, 16); + VXC_DP4x4(left4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * dfpScale; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_I8toI8_DOWN @@ -112,98 +139,55 @@ __kernel void resize_bilinear_I8toI8_DOWN ) { int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; - float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); - float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; - float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; 
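/* Editor's note (illustrative sketch, not part of the patch): the coordinate setup
 * above uses the usual half-pixel source mapping, and the former ceil()-based
 * right_x_idx / bottom_y_idx variables are dropped because the second neighbour is
 * now fetched with a +1 texel offset from the floor()ed coordinate rather than a
 * separate indexed read. Scalar C sketch of the mapping; the names are mine: */
#include <math.h>

static void map_to_source(float out_coord, float scale, float half_pixel,
                          int *left_idx, float *lerp)
{
    float in = (out_coord + half_pixel) * scale - half_pixel;
    float left_f = floorf(in);
    *left_idx = (int)left_f;   /* the right/bottom neighbour is simply left_idx + 1 */
    *lerp = in - left_f;
}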
int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - - vxc_char16 top_left, top_right; - vxc_char16 bottom_left, bottom_right; - + vxc_char16 top, bottom, result; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, 
VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, top_right, top_right, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, bottom_left, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - VXC_DP4x4(right4, bottom_right, bottom_right, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; @@ -213,6 +197,11 @@ __kernel void resize_bilinear_I8toI8_DOWN int4 dst = convert_int4_rte(dst4); - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx index 4c21bd7..39f239a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx @@ -1,13 +1,13 @@ #include "cl_viv_vx_ext.h" -_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtact8Bit_2x8; _viv_uniform float2 scale_xy; _viv_uniform int depth; _viv_uniform int input_ZP; _viv_uniform float uint8Scale; _viv_uniform float output_ZP; -_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; _viv_uniform float half_pixel_value; @@ -26,69 +26,36 @@ __kernel void resize_bilinear_U8toF16_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= 
convert_int(bottom_y_f); - vxc_uchar16 top_left, top_right; - vxc_uchar16 bottom_left, bottom_right; + vxc_uchar16 top, bottom; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; float4 right4; @@ -97,16 +64,12 @@ __kernel void resize_bilinear_U8toF16_DOWN unsigned char inputZP; _viv_asm(COPY, inputZP, input_ZP, 4); - VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; @@ -120,7 +83,12 @@ __kernel void resize_bilinear_U8toF16_DOWN vxc_short8 dst_short; _viv_asm(COPY, dst_short, dst, 16); - VXC_WriteImage2DArray(output, coord_out, dst_short.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_U8toU8_UP @@ -147,8 +115,6 @@ __kernel void resize_bilinear_U8toU8_UP float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); vxc_uchar16 src0, src1; @@ -157,12 +123,15 @@ __kernel void resize_bilinear_U8toU8_UP int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; @@ -172,46 +141,67 @@ __kernel void resize_bilinear_U8toU8_UP VXC_DP2x8(maskShift, bitextract_p0, constData, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); - do + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + 
_viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.w += input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord_in.z ++; - coord_in.y = top_y_idx; - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.y = bottom_y_idx; - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; - unsigned char inputZP; _viv_asm(COPY, inputZP, input_ZP, 4); - VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); - + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; VXC_DP4x4(left4, bottom, inputZP, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); - + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + } - coord_out.z ++; - } while (coord_in.z < depth); + VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + unsigned char inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_U8toU8_DOWN @@ 
-228,69 +218,36 @@ __kernel void resize_bilinear_U8toU8_DOWN float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; int4 left_x_idx = convert_int4(left_x_f); - float4 right_x_f = ceil(in_x); - int4 right_x_idx = convert_int4(right_x_f); float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; float top_y_f = floor(in_y); float y_lerp = in_y - top_y_f; int top_y_idx = convert_int(top_y_f); - float bottom_y_f = ceil(in_y); - int bottom_y_idx= convert_int(bottom_y_f); - vxc_uchar16 top_left, top_right; - vxc_uchar16 bottom_left, bottom_right; + vxc_uchar16 top, bottom, result; int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(top_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(top_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.y = bottom_y_idx; - coord_in.x = left_x_idx.x; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_ReadImage2DArray(bottom_left, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - coord_in.x = right_x_idx.x; - VXC_ReadImage2DArray(bottom_right, 
input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.y; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.z; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord_in.x = right_x_idx.w; - VXC_ReadImage2DArray(bottom_right, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; float4 right4; @@ -299,25 +256,26 @@ __kernel void resize_bilinear_U8toU8_DOWN unsigned char inputZP; _viv_asm(COPY, inputZP, input_ZP, 4); - VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); - - right4 -= left4; + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; dst4 = dst4 * uint8Scale + output_ZP; - int4 dst = convert_int4_rte(dst4); - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx index 640560e..59e8211 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx @@ -69,7 +69,8 @@ __kernel void resize_bilinear_U8toU8_UP_opt baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - do + int loop = depth - 1; + while (coord_in.z < loop) { VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); @@ -88,8 +89,17 @@ __kernel void resize_bilinear_U8toU8_UP_opt VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_out.w += output_desc.s4; - coord_out.z 
++; - } while (coord_out.z < depth); + coord_in.z ++; + } + + VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 dst; + VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom, + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } #endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx index 9d2838c..7172017 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx @@ -28,18 +28,30 @@ __kernel void resize_nearest_F16toF16 vxc_short8 src; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } _viv_uniform VXC_512Bits uniGetExtractData_2x8; @@ -56,18 +68,29 @@ __kernel void resize_nearest_F16toF16_op vxc_ushort8 src0, src1, dst; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - //in_x_idx = in_x_idx - in_x_idx.xxxx; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 
8, 8, 16, 16, 16, 16, 16, 16, 16, 16); vxc_ushort8 input_idx; _viv_asm(COPY, input_idx, in_x_idx, 16); VXC_DP2x8(mask, input_idx, input_idx, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } _viv_uniform VXC_512Bits uniConvertI8toI8_2x8; @@ -84,19 +107,31 @@ __kernel void resize_nearest_I8toI8 vxc_char16 src; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_I8toI8_op @@ -113,8 +148,14 @@ __kernel void resize_nearest_I8toI8_op vxc_char16 dst; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); vxc_ushort8 input_idx; _viv_asm(COPY, input_idx, in_x_idx, 16); @@ -123,7 +164,13 @@ __kernel void resize_nearest_I8toI8_op VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, dst, dst0, 8); VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 
3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_U8toU8 @@ -139,22 +186,34 @@ __kernel void resize_nearest_U8toU8 vxc_uchar16 src; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 multiplier; _viv_asm(COPY, multiplier, multAndoutZP, 16); VXC_DP2x8(src, src, multiplier, \ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_U8toU8_op @@ -170,8 +229,14 @@ __kernel void resize_nearest_U8toU8_op vxc_uchar16 src0, dst; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); vxc_ushort8 input_idx; _viv_asm(COPY, input_idx, in_x_idx, 16); @@ -180,7 +245,13 @@ __kernel void resize_nearest_U8toU8_op vxc_ushort8 multiplier; _viv_asm(COPY, multiplier, multAndoutZP, 16); VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, 
output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_I16toI16 @@ -196,19 +267,32 @@ __kernel void resize_nearest_I16toI16 vxc_short8 src; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_ReadImage2DArray(src, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_nearest_I16toI16_op @@ -224,10 +308,16 @@ __kernel void resize_nearest_I16toI16_op vxc_ushort8 src0, src1, dst0; vxc_short8 dst; int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); - VXC_ReadImage2DArray(src0, input, coord_in, \ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, \ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); //in_x_idx = in_x_idx - in_x_idx.xxxx; vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16); @@ -237,5 +327,11 @@ __kernel void resize_nearest_I16toI16_op VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, dst, dst0, 8); VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + int8 
output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/space2depth_internal.vx b/src/tim/vx/internal/src/libnnext/ops/vx/space2depth_internal.vx new file mode 100644 index 0000000..b8bb334 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/space2depth_internal.vx @@ -0,0 +1,135 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtractEvenUint8Stride2_2x8; +_viv_uniform VXC_512Bits uniExtractOddUint8Stride2_2x8; +_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4; +_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4; + +_viv_uniform int input_depth; + +#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT(src0_type_name, src1_type_name, read_type) \ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name( \ + image2d_array_t input, \ + image2d_array_t output, \ + int block_size_x, \ + int block_size_y \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + read_type src; \ + VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + ushort stride_x = (ushort)block_size_x; \ + ushort stride_y = (ushort)block_size_y; \ + ushort sidx = (ushort)gidx; \ + ushort sidy = (ushort)gidy; \ + ushort tmpX = sidx % stride_x; \ + ushort tmpY = sidy % stride_y; \ + int tmpId0 = tmpX; \ + int tmpId1 = tmpY; \ + int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0); \ + coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz; \ + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(U8, U8, vxc_uchar16) +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I8, I8, vxc_char16) +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I16, I16, vxc_short8) + +__kernel void space2depth_internal_F16toF16( + image2d_array_t input, + image2d_array_t output, + int block_size_x, + int block_size_y + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(gidx, gidy, gidz, 0); + vxc_short8 data, imgVal0; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + ushort stride_x = (ushort)block_size_x; + ushort stride_y = (ushort)block_size_y; + ushort sidx = (ushort)gidx; + ushort sidy = (ushort)gidy; + ushort tmpX = sidx % stride_x; + ushort tmpY = sidy % stride_y; + int tmpId0 = tmpX; + int tmpId1 = tmpY; + int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0); + coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz; + + VXC_WriteImage2DArray(output, coord_out, data, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \ + image2d_array_t input, \ + image2d_array_t output, \ + int block_size_x, \ + int block_size_y \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_out = (int4)(gidx 
>> 1, gidy, gidz, 0); \ + int out_d1; \ + read_type imageData; \ + write_type imgVal0, imgVal1; \ + \ + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + out_d1 = gidz + input_depth; \ + \ + VXC_DP2x8(imgVal0, imageData, imageData,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractEvenUint8Stride2_2x8); \ + VXC_DP2x8(imgVal1, imageData, imageData,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddUint8Stride2_2x8); \ + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.z = out_d1; \ + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(U8, U8, vxc_uchar16, vxc_uchar16) +SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(I8, I8, vxc_char16, vxc_char16) + +#define SPACE2DEPTH_INTERNAL_16BITS_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \ + image2d_array_t input, \ + image2d_array_t output, \ + int block_size_x, \ + int block_size_y \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \ + int out_d1; \ + read_type imageData; \ + write_type imgVal0, imgVal1; \ + \ + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + out_d1 = gidz + input_depth; \ + VXC_DP4x4(imgVal0, imageData, imageData, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractEvenFp16Stride2_4x4); \ + VXC_DP4x4(imgVal1, imageData, imageData, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractOddFp16Stride2_4x4); \ + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_out.z = out_d1; \ + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +SPACE2DEPTH_INTERNAL_16BITS_X2Y1(I16, I16, vxc_short8, vxc_short8) +SPACE2DEPTH_INTERNAL_16BITS_X2Y1(F16, F16, vxc_short8, vxc_short8) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx new file mode 100644 index 0000000..efc9266 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx @@ -0,0 +1,58 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertDatatoF32_4x4; +_viv_uniform float output_scale; +_viv_uniform float tail; + +#define UPSAMPLE_SCALETO_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type, conv_func) \ + __kernel void upsamplescale_##src_name##to##dst_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride, \ + float scale) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type read_val; \ + src_type src_val; \ + dst_type dst_val; \ + write_type write_val; \ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src_val, read_val, 16); \ + coord.xy *= stride; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + float4 data; \ + 
VXC_DP4x4(data, src_val, src_val, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertDatatoF32_4x4); \ + data = data * output_scale + tail; \ + _viv_asm(conv_func, dst_val, data); \ + _viv_asm(COPY, write_val, dst_val, 16); \ + int4 coord_out = coord; \ + for (int y = 0; y < stride; y++) \ + { \ + coord_out.x = coord.x; \ + for (int x = 0; x < stride; ) \ + { \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, write_val, \ + VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); \ + x++; \ + coord_out.x ++; \ + } \ + coord_out.y ++; \ + } \ +} + +UPSAMPLE_SCALETO_FUN(F16, F16, vxc_short8, vxc_half8, half4, short4, CONV) +UPSAMPLE_SCALETO_FUN(F16, I16, vxc_short8, vxc_half8, int4, short4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(F16, I8, vxc_short8, vxc_half8, int4, char4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(F16, U8, vxc_short8, vxc_half8, int4, uchar4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(I16, I16, vxc_short8, vxc_short8, int4, short4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(I16, F16, vxc_short8, vxc_short8, half4, short4, CONV) +UPSAMPLE_SCALETO_FUN(I8, I8, vxc_char16, vxc_char16, int4, char4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(I8, F16, vxc_short8, vxc_short8, half4, short4, CONV) +UPSAMPLE_SCALETO_FUN(U8, U8, vxc_uchar16, vxc_uchar16, int4, uchar4, CONV_RTE) +UPSAMPLE_SCALETO_FUN(U8, F16, vxc_short8, vxc_short8, half4, short4, CONV) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx new file mode 100644 index 0000000..d1935b1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx @@ -0,0 +1,83 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniUpScale2X_lo_2x8; +_viv_uniform VXC_512Bits uniUpScale2X_hi_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp + +#define UPSAMPLE_SCALETO8B_FUN(src_name, dst_name, read_type, src_type, dst_type) \ + __kernel void upsamplescale_##src_name##to##dst_name##_K2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride, \ + float scale) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type read_val; \ + src_type src_val; \ + dst_type dst_val; \ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src_val, read_val, 16); \ + coord.xy <<= 1; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst_val, src_val, multiplier, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_lo_2x8); \ + VXC_DP2x8(dst_val, src_val, multiplier, \ + VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_hi_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + coord.y ++; \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} + +UPSAMPLE_SCALETO8B_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char16) +UPSAMPLE_SCALETO8B_FUN(F16, U8, vxc_short8, vxc_half8, vxc_uchar16) +UPSAMPLE_SCALETO8B_FUN(I8, I8, vxc_char16, vxc_char16, vxc_char16) +UPSAMPLE_SCALETO8B_FUN(U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16) + +#define UPSAMPLE_SCALETO16B_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type) \ + __kernel void 
upsamplescale_##src_name##to##dst_name##_K2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride, \ + float scale) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type read_val; \ + src_type src_val; \ + dst_type dst0_val; \ + dst_type dst1_val; \ + write_type write_val; \ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src_val, read_val, 16); \ + coord.xy <<= 1; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst0_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_lo_2x8); \ + VXC_DP2x8(dst1_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_hi_2x8); \ + _viv_asm(COPY, write_val, dst0_val, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.y ++; \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, write_val, dst1_val, 16); \ + coord.xy = coord.xy + (int2)(8, -1); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.y ++; \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +UPSAMPLE_SCALETO16B_FUN(F16, F16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(I8, F16, vxc_char16, vxc_char16, vxc_half8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(U8, F16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8) +UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx deleted file mode 100644 index bc5e1d0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx +++ /dev/null @@ -1,111 +0,0 @@ -#include "cl_viv_vx_ext.h" - -//-----------------------------------------------tensor crop------------------------------- -__kernel void vxcTensorCrop_Int16( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int offset0, - int offset1, - int offset2) -{ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - vxc_ushort8 src0, src1, src2, src3; - - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\ - - offset1, get_global_id(2) - offset2, 0); - - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - 
VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel void vxcTensorCrop_Int8( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int offset0, - int offset1, - int offset2) -{ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar16 src0, src1, src2, src3; - - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1) - offset1,\ - get_global_id(2) - offset2, 0); - - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8; - -__kernel void vxcTensorCrop_Int16_Fp16( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int offset0, - int offset1, - int offset2) -{ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - vxc_short8 src0, src1, src2, src3; - - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\ - - offset1, get_global_id(2) - offset2, 0); - - vxc_half8 dst0, dst1, dst2, dst3; - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt16toFp16_2x8); - VXC_DP2x8(dst1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt16toFp16_2x8); - VXC_DP2x8(dst2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt16toFp16_2x8); - VXC_DP2x8(dst3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt16toFp16_2x8); - - vxc_short8 out0, out1, out2, out3; - _viv_asm(COPY, out0, dst0, 16); - _viv_asm(COPY, out1, dst1, 16); - _viv_asm(COPY, out2, dst2, 16); - _viv_asm(COPY, out3, dst3, 16); - - VXC_WriteImage2DArray(output, coord_out, out0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, out1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, out2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - 
coord_out.y ++; - VXC_WriteImage2DArray(output, coord_out, out3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx deleted file mode 100644 index a052f8c..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx +++ /dev/null @@ -1,63 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int loopNum; -_viv_uniform VXC_512Bits uniMulAcc_16x1; -__kernel void vsi_nn_kernel_fullconnect2( - __read_only image2d_array_t input, - __read_only image2d_array_t weight, - __read_only image2d_array_t bias, - __write_only image2d_array_t output) -{ - int4 coord_in = (int4)(16, get_global_id(0), get_global_id(1), 0); - int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 v0, v1, v2, v3, v4, v5, v6, v7; - vxc_half8 i0, i1, i2, i3; - vxc_half8 w0, w1, w2, w3; - float4 sum = 0; - float dst = 0; - dst = read_imagef(bias, coord_in.ywww).x; - do - { - VXC_ReadImage(v0, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, i0, v0, 16); - VXC_ReadImage(v1, weight, coord_in.xy, VXC_5BITOFFSET_XY(-16, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, w0, v1, 16); - VXC_ReadImage(v2, input, coord_in.xz, VXC_5BITOFFSET_XY(-8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, i1, v2, 16); - VXC_ReadImage(v3, weight, coord_in.xy, VXC_5BITOFFSET_XY(-8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, w1, v3, 16); - VXC_ReadImage(v4, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, i2, v4, 16); - VXC_ReadImage(v5, weight, coord_in.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, w2, v5, 16); - VXC_ReadImage(v6, input, coord_in.xz, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, i3, v6, 16); - VXC_ReadImage(v7, weight, coord_in.xy, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, w3, v7, 16); - - coord_in.x += 32; - - VXC_DP16x1(sum, i0, w0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); - VXC_DP16x1(sum, i1, w1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); - VXC_DP16x1(sum, i2, w2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); - VXC_DP16x1(sum, i3, w3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); - - float4 tmp = {1, 1, 1, 1}; - dst = dst + dot(sum, tmp); - - } while (coord_in.x < loopNum); - - vxc_half v; - _viv_asm(CONV, v, dst); - _viv_asm(COPY, v0, v, 16); - VXC_WriteImage(output, coord_out.xy, v0, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx deleted file mode 100644 index 118764e..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx +++ /dev/null @@ -1,129 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/*****************************layernorm uint8 to fp16****************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits 
uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits UniPackFP16even_2x8; - -__kernel void vxcLayerNormU8toFp16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t output, - float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int4 coord_bias = (int4)(0, 0, 0, 0); - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_short8 src1, outval; - short zp = inputZP; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - coord_bias.x = coord.x; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - bias_f0 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - coord_bias.x += 4; - _viv_asm(CONV, tmpVal0, norm); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - bias_f1 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f1, scale_h, scale_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - int2 coord_out = (int2)(coord.x, coord.y); - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - tmpData2 -= mean; - norm = scale_f0 * vari * tmpData2 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - - tmpData3 -= mean; - norm = scale_f1 * vari * tmpData3 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x += 8; - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx deleted file mode 100644 index 8175ced..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx +++ /dev/null @@ -1,38 +0,0 @@ -#include "cl_viv_vx_ext.h" - -//--------------------------resize------------------------- -_viv_uniform VXC_512Bits uniPackEvenData_2x8; -__kernel void resize_16bits_downsample_quarter - ( - __read_only image2d_array_t input, - __write_only image2d_array_t output - ) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - vxc_short8 src0, src1; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - coord = coord >> 1; - VXC_DP2x8(src0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardInf, 0), uniPackEvenData_2x8); - VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel void resize_8bits_downsample_quarter - ( - __read_only image2d_array_t input, - __write_only image2d_array_t output - ) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - vxc_char16 src0; - vxc_char8 dst; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - coord = coord >> 1; - dst = src0.s02468ace; - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx deleted file mode 100644 index 3c9551d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx +++ /dev/null @@ -1,49 +0,0 @@ -#include "cl_viv_vx_ext.h" - -//--------------------------scale------------------------- -_viv_uniform VXC_512Bits uniExtractHalf8_2x8; -_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Lo_4x4; -_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Hi_4x4; -__kernel void scale_fp16 - ( - __read_only image2d_array_t input, - __read_only image2d_array_t weights, - __read_only image2d_array_t biases, - __write_only image2d_array_t output - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); - vxc_short8 vec0, vec1; - vxc_half8 src0; - vxc_half8 w0; - vxc_float4 b0, b1; - vxc_float4 dst0, dst1; - VXC_ReadImage(vec0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, src0, vec0, 16); - VXC_ReadImage(vec1, weights, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0)); - _viv_asm(COPY, w0, vec1, 16); - - coord.z = coord.x + 4; - - b0 = read_imagef(biases, coord.xwww); - b1 = read_imagef(biases, coord.zwww); - - VXC_DP4x4(dst0, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniFp16MulFp16ToFp32_Lo_4x4); - VXC_DP4x4(dst1, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniFp16MulFp16ToFp32_Hi_4x4); - dst0 += b0; - dst1 += b1; - - half4 t0, t1; - - _viv_asm(CONV, t0, dst0); - _viv_asm(CONV, t1, dst1); - - VXC_DP2x8(w0, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); - _viv_asm(COPY, vec0, w0, 16); - - VXC_WriteImage(output, coord.xy, vec0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx deleted file mode 100644 index 9800aa8..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx +++ /dev/null @@ -1,67 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/******************shuffle channel float16/int16********************/ -_viv_uniform int group_column; -_viv_uniform float rgroup_column; - -__kernel void shuffleChannelVXC( - image2d_array_t input, - image2d_array_t output, - int group_number, - int axis) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - vxc_short8 src0, src1, src2, src3; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - int coordz = coord.z; - int index_col = coordz * rgroup_column; - int index_row = coordz - index_col * group_column; - coord.z = index_row * group_number + index_col; - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -/*****************shuffle channel int8/uint8****************************/ - -__kernel void shuffleChannel8BitsVXC( - image2d_array_t input, - image2d_array_t output, - int group_number, - int axis) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - vxc_char16 src0, src1, src2, src3; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - int coordz = coord.z; - int index_col = coordz * rgroup_column; - int index_row = coordz - index_col * group_column; - coord.z = index_row * group_number + index_col; - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, 
src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord.y ++; - VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx deleted file mode 100644 index a4e0fff..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx +++ /dev/null @@ -1,65 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/******************shuffle channel float16/int16********************/ -_viv_uniform int group_column; -_viv_uniform float rgroup_column; - -__kernel void shuffleChannel16Bits_Axis1( - image2d_array_t input, - image2d_array_t output, - int group_number, - int axis) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_out = coord; - vxc_short8 src0, src1, src2, src3; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - int coordy = coord.y; - int index_col = coordy * rgroup_column; - int index_row = coordy - index_col * group_column; - coord_out.y = index_row * group_number + index_col; - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.x += 8; - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.x += 8; - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.x += 8; - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -/*****************shuffle channel int8/uint8****************************/ - -__kernel void shuffleChannel8Bits_Axis1( - image2d_array_t input, - image2d_array_t output, - int group_number, - int axis) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_out = coord; - vxc_char16 src0, src1; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.x += 16; - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - int coordy = coord.y; - int index_col = coordy * rgroup_column; - int index_row = coordy - index_col * group_column; - coord_out.y = index_row * group_number + index_col; - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - coord_out.x += 16; - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx deleted file mode 100644 index 01957b0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx +++ /dev/null @@ -1,41 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4; 
-_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4; -_viv_uniform int input_depth; - -__kernel void vxcReorg2_fp16_fp16_sx2_sy1 - ( - image2d_array_t input, - image2d_array_t output, - int stridex, - int stridey - ) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidz = get_global_id(2); - - int4 coord = (int4)(gidx, gidy, gidz, 0); - int4 coord_out = (int4)(gidx >> 1, gidy, 0, 0); - int out_d0, out_d1; - vxc_short8 imageData; - vxc_short8 imgVal0, imgVal1; - //int tmpw = gidz / input_depth; \n\ - //int tmpz = gidz % input_depth; \n\ - - VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(imgVal0, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), - uniExtractEvenFp16Stride2_4x4); - VXC_DP4x4(imgVal1, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), - uniExtractOddFp16Stride2_4x4); - - out_d0 = gidz * 2 * 1; - out_d1 = out_d0 + 1; - - coord_out.z = out_d0; - VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - coord_out.z = out_d1; - VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 366041c..fd2db22 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -88,6 +88,84 @@ __kernel void a_times_b_plus_c_F16_F16_F16toF16_2D\n\ VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniA_Times_B_lo_4x4;\n\ +_viv_uniform VXC_512Bits uniA_Times_B_hi_4x4;\n\ +__kernel void a_times_b_plus_c_F16_F16_F32toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_half8 src0, src1, dst;\n\ + vxc_ushort8 vec0, vec1, result;\n\ + float4 b0, b1;\n\ + float4 dst0, dst1;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ + b0 = read_imagef(input2, coord);\n\ + coord.x += 4;\n\ + b1 = read_imagef(input2, coord);\n\ + coord.x -= 4;\n\ +\n\ + VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4);\n\ + VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4);\n\ + dst0 += b0;\n\ + dst1 += b1;\n\ +\n\ + half4 t0, t1;\n\ + _viv_asm(CONV, t0, dst0);\n\ + _viv_asm(CONV, t1, dst1);\n\ + VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + _viv_asm(COPY, result, dst, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void a_times_b_plus_c_F16_F16_F32toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_t input2,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + 
vxc_half8 src0, src1, dst;\n\ + vxc_ushort8 vec0, vec1, result;\n\ + float4 b0, b1;\n\ + float4 dst0, dst1;\n\ +\n\ + VXC_ReadImage(vec0, input0, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, input1, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ + b0 = read_imagef(input2, coord.xy);\n\ + coord.z = coord.x + 4;\n\ + b1 = read_imagef(input2, coord.zy);\n\ +\n\ + VXC_DP4x4(dst0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_lo_4x4);\n\ + VXC_DP4x4(dst1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniA_Times_B_hi_4x4);\n\ + dst0 += b0;\n\ + dst1 += b1;\n\ +\n\ + half4 t0, t1;\n\ + _viv_asm(CONV, t0, dst0);\n\ + _viv_asm(CONV, t1, dst1);\n\ + VXC_DP2x8(dst, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + _viv_asm(COPY, result, dst, 16);\n\ +\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ "; /* end of a_times_b_plus_c_vx*/ static const char add_mean_std_norm_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -3828,10 +3906,11 @@ __kernel void floordiv_BF16BF16toBF16_2D\n\ static const char gather_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int indices_num;\n\ +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;\n\ \n\ __kernel void gather_I8toI8(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -3843,7 +3922,7 @@ __kernel void gather_I8toI8(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_char16 src;\n\ @@ -3855,7 +3934,7 @@ __kernel void gather_I8toI8(\n\ \n\ __kernel void gather_U8toU8(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -3867,7 +3946,7 @@ __kernel void gather_U8toU8(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_uchar16 src;\n\ @@ -3879,7 +3958,7 @@ __kernel void gather_U8toU8(\n\ \n\ __kernel void gather_I16toI16(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -3893,7 +3972,7 @@ __kernel void gather_I16toI16(\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ \n\ \n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -3905,7 +3984,7 @@ __kernel void gather_I16toI16(\n\ \n\ __kernel void gather_F16toF16(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -3919,7 +3998,7 @@ __kernel void gather_F16toF16(\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ \n\ \n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, 
coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -3928,6 +4007,110 @@ __kernel void gather_F16toF16(\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +__kernel void gather_I8toI8_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_char16 src, dst;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_U8toU8_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_uchar16 src, dst;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_I16toI16_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_F16toF16_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only 
image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ "; /* end of gather_vx*/ static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -3943,7 +4126,7 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ #define GATHER_8BITS_TO_F16(src0_type_name, read_type) \\\n\ __kernel void gather_##src0_type_name##toF16( \\\n\ __read_only image2d_t input0, \\\n\ - __read_only image2d_array_t input1, \\\n\ + __read_only image2d_t input1, \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ @@ -3955,7 +4138,7 @@ __kernel void gather_##src0_type_name##toF16( \\\n\ int gidz = get_global_id(2); \\\n\ \\\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord_in.xyyy); \\\n\ + int4 indice = read_imagei(input1, coord_in.xy); \\\n\ coord_in.w = gidz * axis_num + indice.x; \\\n\ \\\n\ read_type src; \\\n\ @@ -3979,7 +4162,7 @@ GATHER_8BITS_TO_F16(I8, vxc_char16)\n\ #define GATHER_F16_TO_QINT(src1_type_name, write_type) \\\n\ __kernel void gather_F16to##src1_type_name( \\\n\ __read_only image2d_t input0, \\\n\ - __read_only image2d_array_t input1, \\\n\ + __read_only image2d_t input1, \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ @@ -3991,7 +4174,7 @@ __kernel void gather_F16to##src1_type_name( \\\n\ int gidz = get_global_id(2); \\\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ \\\n\ - int4 indice = read_imagei(input1, coord_in.xyyy); \\\n\ + int4 indice = read_imagei(input1, coord_in.xy); \\\n\ coord_in.w = gidz * axis_num + indice.x; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -4011,7 +4194,7 @@ GATHER_F16_TO_QINT(I16, vxc_short8)\n\ \n\ __kernel void gather_I16toF16(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_array_t input1,\n\ + __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ @@ -4023,7 +4206,7 @@ __kernel void gather_I16toF16(\n\ int gidz = get_global_id(2);\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -4041,6 +4224,100 @@ __kernel void gather_I16toF16(\n\ \n\ VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +#define GATHER_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \\\n\ +__kernel void gather_##src0_type_name##toF16_axis0( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ 
+{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int4 indices = read_imagei(input1, coord.xx); \\\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + indices.x = get_global_id(1); \\\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_8BITS_TO_F16_AXIS0(U8, vxc_uchar16)\n\ +GATHER_8BITS_TO_F16_AXIS0(I8, vxc_char16)\n\ +\n\ +#define GATHER_F16_TO_QINT_AXIS0(src1_type_name, write_type) \\\n\ +__kernel void gather_F16to##src1_type_name##_axis0( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int4 indices = read_imagei(input1, coord.xx); \\\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + indices.x = get_global_id(1); \\\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_F16_TO_QINT_AXIS0(U8, vxc_uchar16)\n\ +GATHER_F16_TO_QINT_AXIS0(I8, vxc_char16)\n\ +GATHER_F16_TO_QINT_AXIS0(I16, vxc_short8)\n\ +\n\ +__kernel void gather_I16toF16_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 indices = read_imagei(input1, coord.xx);\n\ + int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + indices.x = get_global_id(1);\n\ + VXC_ReadImage(src, input0, indices.yx, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.zx, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indices.wx, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_half8 src0;\n\ + vxc_short8 dst0;\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ "; /* end of gather_mix_vx*/ static const char gather_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -7776,6 +8053,2059 @@ L2NORMSCALE_AXIS1_U8_2D(F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, L2NORMSCALE_AXIS1_U8_2D(F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ "; /* end of l2normalizescale_axis1_vx*/ +static const char layer_normalization_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ +\n\ +__kernel void layer_norm_F16toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xw);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + half4 norm_h;\n\ + _viv_asm(CONV, norm_h, norm);\n\ + vxc_half8 dst;\n\ + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtractHalf4_dp4x4);\n\ + vxc_short8 dstval;\n\ + _viv_asm(COPY, dstval, dst, 16);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \\\n\ + 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/*****************************layernorm uint8 to uint8****************************/\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +__kernel void layer_norm_U8toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ + short zp = inputZP;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int2 coord_bias = (int2)(0, 0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + 
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \\\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/***************************layernorm float16 to uint8**************************/\n\ +__kernel void layer_norm_F16toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; 
coord.x += 4)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xw);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + norm = norm * outputScale + output_zp;\n\ + int4 output_int4;\n\ + output_int4 = convert_int4_rte(norm);\n\ + vxc_uchar8 dst;\n\ + VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_vx*/ + +static const char layer_normalization_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ +\n\ +__kernel void layer_norm_F16toF16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xw);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + half4 norm_h;\n\ + _viv_asm(CONV, norm_h, norm);\n\ + vxc_half8 dst;\n\ + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0),\\\n\ + uniExtractHalf4_dp4x4);\n\ + vxc_short8 dstval;\n\ + _viv_asm(COPY, dstval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/*****************************layernorm uint8 to uint8****************************/\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +__kernel void layer_norm_U8toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ + short zp = inputZP;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int2 coord_bias = (int2)(0, 0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + 
uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean;\n\ + tmpData1 = tmpData1 * input_scale - mean;\n\ + tmpData2 = tmpData2 * input_scale - mean;\n\ + tmpData3 = tmpData3 * input_scale - mean;\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/***************************layernorm float16 to uint8**************************/\n\ +__kernel void layer_norm_F16toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xw);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + norm = norm * outputScale + output_zp;\n\ + int4 output_int4;\n\ + output_int4 = convert_int4_rte(norm);\n\ + vxc_uchar8 
dst;\n\ + VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_2d_vx*/ + +static const char layer_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform float dimRatio_scale;\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel void layer_norm_I16toI16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_float sum = 0, sqr = 0;\n\ + for(; coord.x < width;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr = sqr + sumsqr.y * e2InScale;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio_scale;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_half8 scale_h;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + int2 coord_bias = (int2)(0, 0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, 
norm;\n\ + sub = tmpData0 * input_scale - mean;\n\ + norm = scale_f0 * vari * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 * input_scale - mean;\n\ + norm = scale_f1 * vari * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_I16toI16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_float sum = 0, sqr = 0;\n\ + for(; coord.x < width;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr = sqr + sumsqr.y * e2InScale;\n\ + }\n\ + vxc_float mean, vari;\n\ + mean = sum * dimRatio_scale;\n\ + vari = sqr * dimRatio - mean * mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_half8 scale_h;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + int2 coord_bias = (int2)(0, 0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 * input_scale - mean;\n\ + norm = scale_f0 * vari * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 * input_scale - mean;\n\ + norm = scale_f1 * vari * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_i16_vx*/ + +static const char layer_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/*****************************layernorm uint8 to fp16****************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits 
uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits UniPackFP16even_2x8;\n\ +\n\ +__kernel void layer_norm_U8toF16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_array_t output,\n\ + float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_short8 src1, outval;\n\ + short zp = inputZP;\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.x = coord.x;\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_U8toF16_2D(\n\ + image2d_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t output,\n\ + float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_short8 src1, outval;\n\ + short zp = inputZP;\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 dst;\n\ +\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + coord_bias.x = coord.x;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + coord_out.x = coord.x;\n\ + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + coord_out.x += 8;\n\ + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_u8_f16_vx*/ + +static const char layer_normalization_wh_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform int width;\n\ +\n\ +_viv_uniform int height;\n\ +\n\ +_viv_uniform int height_depth;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = 
(vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += 
read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y;\n\ + coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 - mean_vari.s0;\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + sub = tmpData1 - mean_vari.s0;\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + 
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 - mean_vari.s0;\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + sub = tmpData1 - mean_vari.s0;\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y;\n\ + coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + 
coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 - mean_vari.s0;\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 - mean_vari.s0;\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 - mean_vari.s0;\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 - mean_vari.s0;\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_wh_f16_vx*/ + +static const char layer_normalization_wh_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform int width;\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int height;\n\ +\n\ +_viv_uniform int height_depth;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + float4 tmpSumSqr = (float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + tmpSumSqr.x *= input_scale;\n\ + tmpSumSqr.y *= e2InScale;\n\ + }\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + float4 data = (float4)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + data.x += dot(tmp_sum[i], one);\n\ + data.y += dot(tmp_sqr[i], one);\n\ + }\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D(\n\ + image2d_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_short8 src0;\n\ + float4 tmpSumSqr = (float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + tmpSumSqr.x 
*= input_scale;\n\ + tmpSumSqr.y *= e2InScale;\n\ + }\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + float4 data = (float4)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + data.x += dot(tmp_sum[i], one);\n\ + data.y += dot(tmp_sqr[i], one);\n\ + }\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_short8 src0, src1, outval;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y;\n\ + coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * 
tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_short8 src0, src1, outval;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_wh_i16_vx*/ + +static const char layer_normalization_wh_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform int width;\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int height;\n\ +\n\ +_viv_uniform int height_depth;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits 
uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ + }\n\ + sqr += (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D(\n\ + image2d_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ + }\n\ + sqr += (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; 
sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 
1, 1))) void layernorm_wh_U8toF16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ + vxc_uchar16 src0 , outval;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, 
baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.y = coord.y;\n\ + coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_t output, float eps)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_bias = (int2)(0, 0);\n\ + vxc_uchar16 src0, outval;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_bias);\n\ + coord_bias.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord_bias = coord;\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_bias.y = coord.y;\n\ + VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + 
VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ + tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_wh_u8_vx*/ + static const char log_softmax_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform float rlogE;\n\ _viv_uniform int axisSize;\n\ @@ -21179,144 +23509,6 @@ __kernel void pre_process_bgra_copy_U8toU8(\n\ }\n\ "; /* end of pre_process_bgra_vx*/ -static const char pre_process_bgra_trans_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniExtractInt32BgraToU8Bgr_2x8;\n\ -\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_bgra_scale_nhwc_U8toU8(\n\ - __read_only image2d_array_t input, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 
0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx = (sx + (*xOffset)) * 4 ;\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y);\n\ - vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3;\n\ - vxc_uchar16 dataB, dataG, dataR;\n\ -\n\ - VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos.w = sx.w;\n\ -\n\ - VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_uchar4 val_u8;\n\ - int4 tmp1, tmp2, result1, result2;\n\ - float4 tmpDst, tmp0;\n\ - float4 mean = (float4)(bMean, gMean, rMean, 0);\n\ - //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);\n\ - int tmpV = 1 << 19;\n\ - vxc_short8 tmpFx;\n\ - VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ - uniConvertInt32toUint8_2x8);\n\ - //tmpFx = fx.xxxx;\n\ - VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ - uniBilinearTmp1BgraShort_4x4);\n\ - VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ - uniBilinearTmp2BgraShort_4x4);\n\ - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniConvertIntergetoF32_4x4);\n\ - tmpDst = (tmp0 - mean) * var;\n\ - result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - //tmpFx = fx.yyyy;\n\ - VXC_DP4x4(tmp1, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4);\n\ - VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4);\n\ - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniConvertIntergetoF32_4x4);\n\ - tmpDst = (tmp0 - mean) * var;\n\ - result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - vxc_uchar16 dst;\n\ - VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1),\n\ - uniExtractInt32BgraToU8Bgr_2x8);\n\ -\n\ - //tmpFx = fx.zzzz;\n\ - VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4);\n\ - VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4);\n\ - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), \\\n\ - uniConvertIntergetoF32_4x4);\n\ - tmpDst = (tmp0 - mean) * var;\n\ - result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - //tmpFx = fx.wwww;\n\ - VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4);\n\ - VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4);\n\ - tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ - VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniConvertIntergetoF32_4x4);\n\ - tmpDst = (tmp0 - mean) * var;\n\ - result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(6, 11, 0, VXC_RM_ToNearestEven, 1),\n\ - uniExtractInt32BgraToU8Bgr_2x8);\n\ -\n\ - int4 dstPos = (int4)(get_global_id(0) * 3, gidy, 0, 0);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of pre_process_bgra_trans_vx*/ - static const char pre_process_gray_vx[] = "/*\n\ ============================================================================\n\ Name : GrayScale.vx\n\ @@ -22287,97 +24479,6 @@ __kernel void pre_process_nv12_scale_U8toF16_gq(\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of pre_process_nv12_scale_mix_vx*/ -static const char pre_process_nv12_trans_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -\n\ -_viv_uniform float outputScaleVar;\n\ -_viv_uniform float bMeanScaleVarZp;\n\ -_viv_uniform float gMeanScaleVarZp;\n\ -_viv_uniform float rMeanScaleVarZp;\n\ -\n\ -_viv_uniform uint xrIntFloat_16;\n\ -_viv_uniform uint yrIntFloat_16;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;\n\ -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;\n\ -\n\ -__kernel void pre_process_nv12_trans_U8toU8(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, 
VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result, dstR, dstG, dstB;\n\ - vxc_uchar16 dst, tmpPack;\n\ - dstB = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstG = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstR = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ -\n\ - if(bOrder == 2)\n\ - {\n\ - int4 exchangeData = dstB;\n\ - dstB = dstR;\n\ - dstR = exchangeData;\n\ - }\n\ -\n\ - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);\n\ -\n\ - int2 dstPos = (int2)(get_global_id(0) * 3, gidy);\n\ - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of pre_process_nv12_trans_u8_vx*/ - static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniVecShift10;\n\ @@ -22711,276 +24812,6 @@ IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16)\n\ IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16)\n\ "; /* end of pre_process_rgb_copy_vx*/ -static const char pre_process_rgb_copy_trans_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float outputZP;\n\ -_viv_uniform VXC_512Bits uniNormilizationLo_2x8;\n\ -_viv_uniform VXC_512Bits uniNormilizationHi_2x8;\n\ -#define IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(dst_name, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \\\n\ - ( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output, \\\n\ - global int *xRatio, \\\n\ - global int *yRatio, \\\n\ - global int *xOffset, \\\n\ - global int *yOffset, \\\n\ - float rMean, \\\n\ - float gMean, \\\n\ - float bMean, \\\n\ - float f32Var, \\\n\ - int reverse_channel, \\\n\ - int trans \\\n\ - ) \\\n\ -{ \\\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - \\\n\ - coord.xy += (int2) (*xOffset, *yOffset); \\\n\ - vxc_uchar16 src0, src1; \\\n\ - dst_type dst0, dst1; \\\n\ - copy_type dst; \\\n\ - \\\n\ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - f32Var *= outputScale; \\\n\ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ - bMean * f32Var - outputZP, f32Var); \\\n\ - half4 
paramData_f16; \\\n\ - _viv_asm(CONV, paramData_f16, paramData); \\\n\ - \\\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); \\\n\ - coord_out.z = coord_out.x + 8; \\\n\ - \\\n\ - VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniNormilizationLo_2x8); \\\n\ - VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniNormilizationHi_2x8); \\\n\ - _viv_asm(COPY, dst, dst0, 16); \\\n\ - VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, dst, dst1, 16); \\\n\ - VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ -}\n\ -IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(I16, vxc_short8, vxc_short8)\n\ -IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(F16, vxc_half8, vxc_short8)\n\ -\n\ -#define IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(dst_name, dst_type) \\\n\ -__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \\\n\ - ( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output, \\\n\ - global int *xRatio, \\\n\ - global int *yRatio, \\\n\ - global int *xOffset, \\\n\ - global int *yOffset, \\\n\ - float rMean, \\\n\ - float gMean, \\\n\ - float bMean, \\\n\ - float f32Var, \\\n\ - int reverse_channel, \\\n\ - int trans \\\n\ - ) \\\n\ -{ \\\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - coord.xy += (int2) (*xOffset, *yOffset); \\\n\ - vxc_uchar16 src0, src1; \\\n\ - dst_type dst; \\\n\ - \\\n\ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - f32Var *= outputScale; \\\n\ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ - bMean * f32Var - outputZP, f32Var); \\\n\ - \\\n\ - half4 paramData_f16; \\\n\ - _viv_asm(CONV, paramData_f16, paramData); \\\n\ - \\\n\ - int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - \\\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniNormilizationLo_2x8); \\\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniNormilizationHi_2x8); \\\n\ - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ -}\n\ -IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(U8, vxc_uchar16)\n\ -IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(I8, vxc_char16)\n\ -"; /* end of pre_process_rgb_copy_trans_vx*/ - -static const char pre_process_rgb_trans_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniVecShift10;\n\ -_viv_uniform VXC_512Bits uniAddRShift;\n\ -_viv_uniform VXC_512Bits uniGetTempVal;\n\ -_viv_uniform VXC_512Bits uniExtractBytes;\n\ -_viv_uniform VXC_512Bits uniUnpackToR;\n\ -_viv_uniform VXC_512Bits uniUnpackToG;\n\ -_viv_uniform VXC_512Bits uniUnpackToB;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ -_viv_uniform float outputZP;\n\ -\n\ -_viv_uniform VXC_512Bits uniRePackRGBLo_2x8;\n\ -_viv_uniform VXC_512Bits uniRePackRGBHi_2x8;\n\ -#define IMAGE_PRE_PROCESS_NHWC(dst_name, conv_type, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb_scale_nhwc_U8to##dst_name \\\n\ - ( \\\n\ -__read_only image2d_array_t input, \\\n\ -__write_only image2d_array_t output, \\\n\ - global int *xRatio, \\\n\ - global int *yRatio, \\\n\ - global int *xOffset, \\\n\ - global int *yOffset, \\\n\ - float 
rMean, \\\n\ - float gMean, \\\n\ - float bMean, \\\n\ - float f32Var, \\\n\ - int reverse_channel, \\\n\ - int trans \\\n\ - ) \\\n\ -{ \\\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ - int4 xPos = get_global_id(0); \\\n\ - int yPos = get_global_id(1); \\\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ - xPos += (int4)(0, 1, 2, 3); \\\n\ - \\\n\ - /*x*/ \\\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ - int4 sx = fx0 & 0xffff8000; \\\n\ - fx0 -= sx; \\\n\ - sx = sx >> 15; \\\n\ - \\\n\ - vxc_short4 fx; \\\n\ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ - /*y*/ \\\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ - int sy = fy & 0xffff8000; \\\n\ - \\\n\ - fy -= sy; \\\n\ - sy = sy >> 15; \\\n\ - \\\n\ - fy = (fy + (1<< 4)) >> 5; \\\n\ - \\\n\ - vxc_uchar16 line0RGB1, line0RGB2; \\\n\ - vxc_uchar16 line1RGB3, line1RGB4; \\\n\ - int4 coord; \\\n\ - sx = sx * 3 + *xOffset; \\\n\ - coord.xyz = sx.xyz; \\\n\ - coord.w = sy + *yOffset; \\\n\ - int2 coord1 = (int2)(sx.w, coord.w); \\\n\ - VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\ - \\\n\ - bgrMean *= f32Var; \\\n\ - \\\n\ - int4 test01, temp1; \\\n\ - int4 test02, temp2; \\\n\ - int4 tt; \\\n\ - vxc_uchar4 val; \\\n\ - int4 coord_out = (int4)(xPos.x * 3, yPos, xPos.x * 3 + 6, 0); \\\n\ - \\\n\ - vxc_uchar8 line1, line2; \\\n\ - \\\n\ - /*R*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ - \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp1 = temp1 + test01; \\\n\ - \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp2 = temp2 + test02; \\\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - vxc_float4 tmp_dst; \\\n\ - vxc_uchar4 u8_dst; \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ - uniConvertIntergetoF32_4x4); \\\n\ - \\\n\ - /*convert U8 to dst*/ \\\n\ - dst_type dstRG, dstB, dst; \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - 
conv_type dst0; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - \\\n\ - /*G*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ - \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp1 = temp1 + test01; \\\n\ - \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp2 = temp2 + test02; \\\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ - uniConvertIntergetoF32_4x4); \\\n\ - \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.y; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - \\\n\ - /*B*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ - \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp1 = temp1 + test01; \\\n\ - \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ - temp2 = temp2 + test02; \\\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ - uniConvertIntergetoF32_4x4); \\\n\ - \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.x; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dstB, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBLo_2x8); \\\n\ - copy_type result; \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBHi_2x8); \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage(output, coord_out.zy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ -}\n\ -IMAGE_PRE_PROCESS_NHWC(U8, uint4, vxc_uchar16, vxc_uchar16)\n\ -IMAGE_PRE_PROCESS_NHWC(I8, int4, vxc_char16, vxc_char16)\n\ -IMAGE_PRE_PROCESS_NHWC(I16, int4, vxc_short8, vxc_short8)\n\ -IMAGE_PRE_PROCESS_NHWC(F16, half4, vxc_half8, vxc_short8)\n\ -"; /* end of pre_process_rgb_trans_vx*/ - static const char 
pre_process_yuv420_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;\n\ @@ -23006,19 +24837,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ \n\ -_viv_uniform VXC_512Bits uniPackBG0_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;\n\ -_viv_uniform VXC_512Bits uniPackRB0_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;\n\ -_viv_uniform VXC_512Bits uniPackGR1_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;\n\ -_viv_uniform VXC_512Bits uniPackBG1_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;\n\ -_viv_uniform VXC_512Bits uniPackRB2_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;\n\ -_viv_uniform VXC_512Bits uniPackGR2_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;\n\ -\n\ _viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;\n\ _viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;\n\ _viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;\n\ @@ -23128,140 +24946,6 @@ __kernel void pre_process_yuv420_copy_U8toU8(\n\ VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -// store bgrbgrbgr\n\ -__kernel void pre_process_yuv420_copy_trans_U8(\n\ - __read_only image2d_t y_img,\n\ - __read_only image2d_t u_img,\n\ - __read_only image2d_t v_img,\n\ - __write_only image2d_array_t output,\n\ - global int * xRatio,\n\ - global int * yRatio,\n\ - global int * xOffset,\n\ - global int * yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float var,\n\ - int reverse_channel,\n\ - int trans\n\ - )\n\ -{\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0);\n\ - vxc_uchar16 Y;\n\ - vxc_uchar8 U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ - vxc_uchar16 dst;\n\ -\n\ - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - var *= outputScale;\n\ - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ - rMean * var - zp, var);\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - //C = Y - 16;\n\ - //D = U - 128;\n\ - //E = V - 128;\n\ - // calculate R\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ -\n\ - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ -\n\ - // calculate G\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP4x4(dst, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ - VXC_DP4x4(dst, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ - VXC_DP4x4(dst, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ -\n\ - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ -\n\ - // calculate B\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ -\n\ - // reorder to bgr\n\ - vxc_uchar8 tmpdst0, tmpdst1;\n\ - vxc_uchar16 dst0, dst1, dst2;\n\ -\n\ - if(bOrder == 2)\n\ - {\n\ - vxc_uchar16 exchangeData = B;\n\ - B = R;\n\ - R = exchangeData;\n\ - }\n\ -\n\ - // BGR BGR BG\n\ - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);\n\ - VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);\n\ -\n\ - // RBG RBG RB\n\ - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);\n\ - VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);\n\ -\n\ - pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.x += 16;\n\ -\n\ - // GRB GRB GR\n\ - VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniPackGR1_2x8);\n\ - VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8);\n\ -\n\ - // BGR BGR BG\n\ - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);\n\ - VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.x += 16;\n\ -\n\ - // RBG RBG RB\n\ - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);\n\ - VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);\n\ -\n\ - // GRB GRB GR\n\ - VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);\n\ - VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ "; /* end of pre_process_yuv420_copy_u8_vx*/ static const char pre_process_yuv420_scale_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -24182,242 +25866,6 @@ __kernel void pre_process_yuv420_scale_U8toU8(\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of pre_process_yuv420_scale_u8_vx*/ -static const char pre_process_yuv420_trans_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;\n\ -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_yuv420_trans_U8toU8(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) 
+ ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 
0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2, dstR, dstG, dstB;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - tmpV = 1 << 19;\n\ - vxc_uchar8 dst, tmpPack;\n\ - float4 tmpDst;\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstB = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstG = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstR = convert_int4_rte(tmpDst * outputScale + zp);\n\ -\n\ - if(bOrder == 2)\n\ - {\n\ - int4 exchangeData = dstB;\n\ - dstB = dstR;\n\ - dstR = exchangeData;\n\ - }\n\ -\n\ - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);\n\ -\n\ - int2 dstPos = (int2)(get_global_id(0) * 3, gidy);\n\ - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_trans_u8_vx*/ - static const char pre_process_yuv444_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;\n\ @@ -24442,19 +25890,6 @@ _viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ \n\ -_viv_uniform VXC_512Bits uniPackBG0_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;\n\ -_viv_uniform VXC_512Bits uniPackRB0_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;\n\ -_viv_uniform VXC_512Bits uniPackGR1_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;\n\ -_viv_uniform VXC_512Bits uniPackBG1_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;\n\ -_viv_uniform VXC_512Bits uniPackRB2_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;\n\ -_viv_uniform VXC_512Bits uniPackGR2_2x8;\n\ -_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;\n\ -\n\ _viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;\n\ _viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;\n\ _viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;\n\ @@ -24563,140 +25998,7 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ pos.z = rOrder;\n\ VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -\n\ -// store bgrbgrbgr\n\ -__kernel void pre_process_yuv444_copy_trans_U8(\n\ - __read_only image2d_t y_img,\n\ - __read_only image2d_t u_img,\n\ - __read_only image2d_t v_img,\n\ - __write_only image2d_array_t output,\n\ - global int * xRatio,\n\ - global int * yRatio,\n\ - global int * xOffset,\n\ - global int * yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float var,\n\ - int reverse_channel,\n\ - int trans\n\ - )\n\ -{\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ - vxc_uchar16 dst;\n\ -\n\ - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - var *= outputScale;\n\ - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ - rMean * var - zp, var);\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - //C = Y - 16;\n\ - 
//D = U - 128;\n\ - //E = V - 128;\n\ - // calculate R\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ -\n\ - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ - VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ -\n\ - // calculate G\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG0, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8);\n\ - VXC_DP4x4(dst, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ - VXC_DP4x4(dst, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ -\n\ - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ - VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ -\n\ - // calculate B\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ 
-\n\ - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ - VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ -\n\ - // reorder to bgr\n\ - vxc_uchar8 tmpdst0, tmpdst1;\n\ - vxc_uchar16 dst0, dst1, dst2;\n\ -\n\ - if(bOrder == 2)\n\ - {\n\ - vxc_uchar16 exchangeData = B;\n\ - B = R;\n\ - R = exchangeData;\n\ - }\n\ -\n\ - // BGR BGR BG\n\ - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);\n\ - VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);\n\ -\n\ - // RBG RBG RB\n\ - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);\n\ - VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);\n\ -\n\ - pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.x += 16;\n\ -\n\ - // GRB GRB GR\n\ - VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8);\n\ - VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8);\n\ -\n\ - // BGR BGR BG\n\ - VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);\n\ - VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.x += 16;\n\ -\n\ - // RBG RBG RB\n\ - VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);\n\ - VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);\n\ -\n\ - // GRB GRB GR\n\ - VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);\n\ - VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv444_copy_u8_vx*/ +"; /* end of pre_process_yuv444_copy_u8_vx*/ static const char pre_process_yuv444_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -25086,203 +26388,6 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of pre_process_yuv444_scale_fp16_vx*/ -static const char pre_process_yuv444_trans_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits 
uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;\n\ -_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ -\n\ -#define IMAGE_PRE_PROCESS_YUV444_TRANS(dst_name, dst_type) \\\n\ -__kernel void pre_process_yuv444_trans_U8to##dst_name( \\\n\ - __read_only image2d_t y_img, __read_only image2d_t u_img, \\\n\ - __read_only image2d_t v_img, __write_only image2d_t output, \\\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \\\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \\\n\ -{ \\\n\ - int4 gidx = get_global_id(0); \\\n\ - int gidy = get_global_id(1); \\\n\ - gidx += (int4)(0, 1, 2, 3); \\\n\ - \\\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \\\n\ - int4 sx = fx & 0xffff8000; \\\n\ - int fy, sy; \\\n\ - fx -= sx; \\\n\ - sx = sx >> 15; \\\n\ - fx = (fx +(1 << 4)) >> 5; \\\n\ - \\\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \\\n\ - sy = fy & 0xffff8000; \\\n\ - fy -= sy; \\\n\ - sy = sy >> 15; \\\n\ - \\\n\ - sy = sy < 0 ? 0 : sy; \\\n\ - fy = fy < 0 ? 0 : fy; \\\n\ - \\\n\ - fy = (fy + (1<< 4)) >> 5; \\\n\ - sx += (*xOffset); \\\n\ - sy += (*yOffset); \\\n\ - int2 srcPos = (int2)(sx.x, sy); \\\n\ - \\\n\ - vxc_uchar16 Y, U, V; \\\n\ - vxc_int4 C0, C1, C2, C3; \\\n\ - vxc_uchar16 R, G, B; \\\n\ - \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - srcPos.x = sx.y; \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - srcPos.x = sx.z; \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - srcPos.x = sx.w; \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - int tmpV = -56992; \\\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \\\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \\\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \\\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \\\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ - \\\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \\\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \\\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \\\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \\\n\ - \\\n\ - ushort tmpG = 34784; \\\n\ - vxc_ushort8 tmpDstG, tmpDstG1; \\\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \\\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ - \\\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \\\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \\\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \\\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \\\n\ - tmpV = 
-70688; \\\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ - \\\n\ - int4 result, temp1, temp2, dstR, dstG, dstB; \\\n\ - int4 tmpData0, tmpData1; \\\n\ - \\\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ - temp1 = fx * tmpData0 + tmpData1; \\\n\ - \\\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ - temp2 = fx * tmpData0 + tmpData1; \\\n\ - result = fy * temp2 + (temp1 << 10); \\\n\ - \\\n\ - tmpV = 1 << 19; \\\n\ - dst_type dst, tmpPack; \\\n\ - float4 tmpDst; \\\n\ - \\\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - bMean) * var; \\\n\ - dstB = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ - \\\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ - temp1 = fx * tmpData0 + tmpData1; \\\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ - temp2 = fx * tmpData0 + tmpData1; \\\n\ - result = fy * temp2 + (temp1 << 10); \\\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - gMean) * var; \\\n\ - dstG = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ - \\\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ - temp1 = fx * tmpData0 + tmpData1; \\\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ - temp2 = fx * tmpData0 + tmpData1; \\\n\ - result = fy * temp2 + (temp1 << 10); \\\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - rMean) * var; \\\n\ - dstR = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ - \\\n\ - if(bOrder == 2) \\\n\ - { \\\n\ - int4 exchangeData = dstB; \\\n\ - dstB = dstR; \\\n\ - dstR = exchangeData; \\\n\ - } \\\n\ - \\\n\ - VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); \\\n\ - VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); \\\n\ - \\\n\ - int2 dstPos = (int2)(get_global_id(0) * 3, gidy); \\\n\ - VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ -}\n\ 
-IMAGE_PRE_PROCESS_YUV444_TRANS(U8, vxc_uchar16)"; /* end of pre_process_yuv444_trans_u8_vx*/ - static const char prelu_vx[] = "\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -29709,37 +30814,34 @@ __kernel void resize_bilinear_BF16toBF16_DOWN\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - int bottom_y_idx = top_y_idx + 1;\n\ vxc_short8 top;\n\ vxc_short8 bottom;\n\ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 src;\n\ float4 left4;\n\ @@ -29765,7 +30867,14 @@ __kernel void resize_bilinear_BF16toBF16_DOWN\n\ vxc_ushort8 tmp, dst;\n\ _viv_asm(COPY, tmp, dst4, 16);\n\ dst.s0123 = tmp.s1357;\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, 
coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_BF16toBF16_UP\n\ @@ -29788,22 +30897,24 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ vxc_ushort8 src0, src1, src2, src3, dst0, dst1;\n\ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ @@ -29813,29 +30924,36 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + 
VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ \n\ vxc_ushort8 dst_tmp;\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ +\n\ \n\ VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ _viv_asm(COPY, left4, dst_tmp, 16);\n\ @@ -29857,16 +30975,39 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ vxc_ushort8 tmp, dst;\n\ _viv_asm(COPY, tmp, dst4, 16);\n\ dst.s0123 = tmp.s1357;\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ +\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_ushort8 dst_tmp;\n\ + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, left4, dst_tmp, 16);\n\ + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, right4, dst_tmp, 16);\n\ + right4 -= left4;\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, left4, dst_tmp, 16);\n\ + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, right4, dst_tmp, 16);\n\ + right4 -= left4;\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + vxc_ushort8 tmp, dst;\n\ + _viv_asm(COPY, tmp, dst4, 16);\n\ + dst.s0123 = tmp.s1357;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_BF16_vx*/ static const char resize_bilinear_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ -_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_left_4x4;\n\ _viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ _viv_uniform float2 scale_xy;\n\ @@ -29892,94 +31033,66 @@ __kernel void resize_bilinear_F16toF16_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ - vxc_short8 top_left0, top_right0;\n\ - vxc_short8 bottom_left0, bottom_right0;\n\ - vxc_half8 top_left, top_right;\n\ - vxc_half8 bottom_left, bottom_right;\n\ + vxc_short8 top_short, bottom_short, dst;\n\ + vxc_half8 top, bottom, 
result;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top_left, top_left0, 16);\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top_right, top_right0, 16);\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, bottom_left, bottom_left0, 16);\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), 
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, bottom_right, bottom_right0, 16);\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, top_short, 16);\n\ + _viv_asm(COPY, bottom, bottom_short, 16);\n\ \n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ \n\ - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ - top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ - bottom4 = right4 * x_lerp + left4;\n\ - bottom4 -= top4;\n\ - float4 dst4 = bottom4 * y_lerp + top4;\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ half4 tmp;\n\ _viv_asm(CONV, tmp, dst4);\n\ - VXC_DP2x8(top_left, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ - _viv_asm(COPY, top_left0, top_left, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, top_left0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(result, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, result, 16);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_F16toU8_DOWN\n\ @@ -29996,84 +31109,50 @@ __kernel void resize_bilinear_F16toU8_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ - vxc_short8 top_left0, top_right0;\n\ - vxc_short8 bottom_left0, bottom_right0;\n\ - vxc_half8 top_left, top_right;\n\ - vxc_half8 bottom_left, 
bottom_right;\n\ +\n\ + vxc_short8 top_short, bottom_short;\n\ + vxc_half8 top, bottom;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top_left, top_left0, 16);\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, top_short, 16);\n\ + _viv_asm(COPY, bottom, bottom_short, 16);\n\ \n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top_right, top_right0, 16);\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left0, input, coord_in, 
\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, bottom_left, bottom_left0, 16);\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, bottom_right, bottom_right0, 16);\n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ @@ -30081,7 +31160,14 @@ __kernel void resize_bilinear_F16toU8_DOWN\n\ int4 dst = convert_int4_rte(dst4);\n\ vxc_uchar8 dst_uchar;\n\ VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst_uchar, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_uchar,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_F16toF16_UP\n\ @@ -30104,24 +31190,26 @@ __kernel void resize_bilinear_F16toF16_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ +\n\ \n\ vxc_ushort8 src0, src1, src2, src3, dst0, dst1;\n\ vxc_half8 top;\n\ vxc_half8 bottom;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - 
coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ @@ -30131,32 +31219,41 @@ __kernel void resize_bilinear_F16toF16_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ +\n\ +\n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ - VXC_DP4x4(right4, bottom, 
bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ @@ -30164,9 +31261,30 @@ __kernel void resize_bilinear_F16toF16_UP\n\ _viv_asm(CONV, tmp, dst4);\n\ VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ _viv_asm(COPY, dst0, top, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ +\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ + _viv_asm(COPY, bottom, dst1, 16);\n\ +\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + half4 tmp;\n\ + _viv_asm(CONV, tmp, dst4);\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst0, top, 16);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_F16_vx*/ @@ -30177,8 +31295,8 @@ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ _viv_uniform float dfpScale;\n\ _viv_uniform float half_pixel_value;\n\ \n\ @@ -30206,8 +31324,6 @@ __kernel void resize_bilinear_I16toI16_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ \n\ vxc_ushort8 src0, src1, src2, src3, dst0, dst1;\n\ \n\ @@ -30216,16 +31332,19 @@ __kernel void resize_bilinear_I16toI16_UP\n\ \n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - 
VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ @@ -30235,39 +31354,42 @@ __kernel void resize_bilinear_I16toI16_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ -\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ -\n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ @@ -30275,10 +31397,30 @@ __kernel void resize_bilinear_I16toI16_UP\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ +\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ + _viv_asm(COPY, bottom, dst1, 16);\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * dfpScale;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ }\n\ \n\ __kernel void resize_bilinear_I16toI16_DOWN\n\ @@ -30297,104 +31439,68 @@ __kernel void resize_bilinear_I16toI16_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ -\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ -\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ -\n\ - vxc_short8 top_left, top_right;\n\ - vxc_short8 bottom_left, bottom_right;\n\ \n\ + vxc_short8 top, bottom, result;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 
0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ \n\ - VXC_DP4x4(left4, top_left, top_left, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, top_right, top_right, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, top, top, \\\n\ + 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom_left, bottom_left, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, bottom_right, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ -\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ -\n\ dst4 = dst4 * dfpScale;\n\ -\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ "; /* end of resize_bilinear_I16_vx*/ @@ -30406,8 +31512,8 @@ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ _viv_uniform float dfpScale;\n\ _viv_uniform float half_pixel_value;\n\ \n\ @@ -30435,8 +31541,6 @@ __kernel void resize_bilinear_I8toI8_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ \n\ vxc_uchar16 src0, src1, dst0, dst1;\n\ \n\ @@ -30445,12 +31549,15 @@ __kernel void resize_bilinear_I8toI8_UP\n\ \n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ @@ -30460,37 +31567,42 @@ __kernel void resize_bilinear_I8toI8_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ \n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ \n\ VXC_DP4x4(left4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ \n\ top4 = right4 * x_lerp + left4;\n\ \n\ VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ \n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ @@ -30498,10 +31610,31 @@ __kernel void resize_bilinear_I8toI8_UP\n\ dst4 = dst4 * dfpScale;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ +\n\ + VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ + _viv_asm(COPY, bottom, dst1, 16);\n\ + VXC_DP4x4(left4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + 
bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * dfpScale;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_I8toI8_DOWN\n\ @@ -30513,98 +31646,55 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ )\n\ {\n\ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ -\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ -\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ -\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ -\n\ - vxc_char16 top_left, top_right;\n\ - vxc_char16 bottom_left, bottom_right;\n\ -\n\ + vxc_char16 top, bottom, result;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - 
VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ \n\ - VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, top_right, top_right, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom_left, bottom_left, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, bottom_right, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ \n\ bottom4 -= top4;\n\ @@ -30614,21 +31704,26 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ \n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + 
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_I8_vx*/ static const char resize_bilinear_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ _viv_uniform int input_ZP;\n\ _viv_uniform float uint8Scale;\n\ _viv_uniform float output_ZP;\n\ -_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ _viv_uniform float half_pixel_value;\n\ @@ -30647,69 +31742,36 @@ __kernel void resize_bilinear_U8toF16_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ - vxc_uchar16 top_left, top_right;\n\ - vxc_uchar16 bottom_left, bottom_right;\n\ + vxc_uchar16 top, bottom;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, 
\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ float4 right4;\n\ @@ -30718,16 +31780,12 @@ __kernel void resize_bilinear_U8toF16_DOWN\n\ \n\ unsigned char inputZP;\n\ _viv_asm(COPY, inputZP, input_ZP, 4);\n\ - VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ \n\ bottom4 -= top4;\n\ @@ -30741,7 +31799,12 @@ __kernel void resize_bilinear_U8toF16_DOWN\n\ vxc_short8 dst_short;\n\ _viv_asm(COPY, dst_short, dst, 16);\n\ \n\ - VXC_WriteImage2DArray(output, coord_out, dst_short.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, 
baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_U8toU8_UP\n\ @@ -30768,8 +31831,6 @@ __kernel void resize_bilinear_U8toU8_UP\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ \n\ vxc_uchar16 src0, src1;\n\ \n\ @@ -30778,12 +31839,15 @@ __kernel void resize_bilinear_U8toU8_UP\n\ \n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ @@ -30793,46 +31857,67 @@ __kernel void resize_bilinear_U8toU8_UP\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ - do\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord_in.z ++;\n\ - coord_in.y = top_y_idx;\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y = bottom_y_idx;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ -\n\ unsigned char inputZP;\n\ _viv_asm(COPY, inputZP, input_ZP, 4);\n\ - VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ -\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ VXC_DP4x4(left4, bottom, inputZP, \\\n\ - VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ -\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ + }\n\ \n\ - coord_out.z ++;\n\ - } while (coord_in.z < depth);\n\ + VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + unsigned char inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_U8toU8_DOWN\n\ @@ -30849,69 +31934,36 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ - float4 right_x_f = ceil(in_x);\n\ - int4 right_x_idx = convert_int4(right_x_f);\n\ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ float top_y_f = floor(in_y);\n\ float y_lerp = in_y - top_y_f;\n\ int top_y_idx = convert_int(top_y_f);\n\ - float bottom_y_f = ceil(in_y);\n\ - int bottom_y_idx= convert_int(bottom_y_f);\n\ - vxc_uchar16 top_left, top_right;\n\ - vxc_uchar16 bottom_left, bottom_right;\n\ + vxc_uchar16 top, bottom, result;\n\ int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 
0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.y = bottom_y_idx;\n\ - coord_in.x = left_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_in.x = right_x_idx.x;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.y;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.z;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = right_x_idx.w;\n\ - VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ float4 right4;\n\ @@ -30920,27 +31972,28 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ \n\ unsigned char inputZP;\n\ _viv_asm(COPY, inputZP, input_ZP, 4);\n\ - VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, top_right, inputZP, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ - VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ -\n\ - right4 -= left4;\n\ + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ \n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ \n\ dst4 = dst4 * uint8Scale + output_ZP;\n\ -\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ - VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_U8_vx*/ @@ -31082,7 +32135,8 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - do\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ {\n\ VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ @@ -31101,8 +32155,17 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_out.w += output_desc.s4;\n\ \n\ - coord_out.z ++;\n\ - } while (coord_out.z < depth);\n\ + coord_in.z ++;\n\ + }\n\ +\n\ + VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ + vxc_uchar16 dst;\n\ + VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom,\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ }\n\ \n\ #endif"; /* end of resize_bilinear_U8_opt_vx*/ @@ -31137,18 +32200,30 @@ __kernel void resize_nearest_F16toF16\n\ vxc_short8 src;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - 
VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ _viv_uniform VXC_512Bits uniGetExtractData_2x8;\n\ @@ -31165,18 +32240,29 @@ __kernel void resize_nearest_F16toF16_op\n\ vxc_ushort8 src0, src1, dst;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - //in_x_idx = in_x_idx - in_x_idx.xxxx;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ vxc_ushort8 input_idx;\n\ _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ VXC_DP2x8(mask, input_idx, input_idx, \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ _viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\ @@ -31193,19 +32279,31 @@ __kernel void resize_nearest_I8toI8\n\ vxc_char16 src;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + 
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_I8toI8_op\n\ @@ -31222,8 +32320,14 @@ __kernel void resize_nearest_I8toI8_op\n\ vxc_char16 dst;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ vxc_ushort8 input_idx;\n\ _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ @@ -31232,7 +32336,13 @@ __kernel void resize_nearest_I8toI8_op\n\ VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, dst, dst0, 8);\n\ VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_U8toU8\n\ @@ -31248,22 +32358,34 @@ __kernel void resize_nearest_U8toU8\n\ vxc_uchar16 src;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + 
VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ vxc_ushort8 multiplier;\n\ _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ VXC_DP2x8(src, src, multiplier, \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_U8toU8_op\n\ @@ -31279,8 +32401,14 @@ __kernel void resize_nearest_U8toU8_op\n\ vxc_uchar16 src0, dst;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ vxc_ushort8 input_idx;\n\ _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ @@ -31289,7 +32417,13 @@ __kernel void resize_nearest_U8toU8_op\n\ vxc_ushort8 multiplier;\n\ _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_I16toI16\n\ @@ -31305,19 +32439,32 @@ __kernel void resize_nearest_I16toI16\n\ vxc_short8 src;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ \n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + 
input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_ReadImage2DArray(src, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_nearest_I16toI16_op\n\ @@ -31333,10 +32480,16 @@ __kernel void resize_nearest_I16toI16_op\n\ vxc_ushort8 src0, src1, dst0;\n\ vxc_short8 dst;\n\ int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ - VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ - VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ //in_x_idx = in_x_idx - in_x_idx.xxxx;\n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ @@ -31346,7 +32499,13 @@ __kernel void resize_nearest_I16toI16_op\n\ VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, dst, dst0, 8);\n\ VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_nearest_vx*/ @@ -31673,6 +32832,142 @@ __kernel void select_I8_U8_U8toU8_2D(\n\ }\n\ "; /* end of select_vx*/ +static const char space2depth_internal_vx[] = 
"#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractEvenUint8Stride2_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddUint8Stride2_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4;\n\ +\n\ +_viv_uniform int input_depth;\n\ +\n\ +#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT(src0_type_name, src1_type_name, read_type) \\\n\ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name( \\\n\ + image2d_array_t input, \\\n\ + image2d_array_t output, \\\n\ + int block_size_x, \\\n\ + int block_size_y \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + ushort stride_x = (ushort)block_size_x; \\\n\ + ushort stride_y = (ushort)block_size_y; \\\n\ + ushort sidx = (ushort)gidx; \\\n\ + ushort sidy = (ushort)gidy; \\\n\ + ushort tmpX = sidx % stride_x; \\\n\ + ushort tmpY = sidy % stride_y; \\\n\ + int tmpId0 = tmpX; \\\n\ + int tmpId1 = tmpY; \\\n\ + int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0); \\\n\ + coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz; \\\n\ + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(U8, U8, vxc_uchar16)\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I8, I8, vxc_char16)\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT(I16, I16, vxc_short8)\n\ +\n\ +__kernel void space2depth_internal_F16toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x,\n\ + int block_size_y\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + vxc_short8 data, imgVal0;\n\ + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + ushort stride_x = (ushort)block_size_x;\n\ + ushort stride_y = (ushort)block_size_y;\n\ + ushort sidx = (ushort)gidx;\n\ + ushort sidy = (ushort)gidy;\n\ + ushort tmpX = sidx % stride_x;\n\ + ushort tmpY = sidy % stride_y;\n\ + int tmpId0 = tmpX;\n\ + int tmpId1 = tmpY;\n\ + int4 coord_out = (int4)((int)(sidx / stride_x), (int)(sidy / stride_y), 0, 0);\n\ + coord_out.z = tmpId0 * input_depth + tmpId1 * block_size_x * input_depth + gidz;\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, data, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \\\n\ + image2d_array_t input, \\\n\ + image2d_array_t output, \\\n\ + int block_size_x, \\\n\ + int block_size_y \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \\\n\ + int out_d1; \\\n\ + read_type imageData; \\\n\ + write_type imgVal0, imgVal1; \\\n\ + \\\n\ + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + out_d1 = gidz + 
input_depth; \\\n\ + \\\n\ + VXC_DP2x8(imgVal0, imageData, imageData,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractEvenUint8Stride2_2x8); \\\n\ + VXC_DP2x8(imgVal1, imageData, imageData,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddUint8Stride2_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.z = out_d1; \\\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +SPACE2DEPTH_INTERNAL_QINT_TO_QINT_X2Y1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +#define SPACE2DEPTH_INTERNAL_16BITS_X2Y1(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void space2depth_internal_##src0_type_name##to##src1_type_name##_X2Y1( \\\n\ + image2d_array_t input, \\\n\ + image2d_array_t output, \\\n\ + int block_size_x, \\\n\ + int block_size_y \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_out = (int4)(gidx >> 1, gidy, gidz, 0); \\\n\ + int out_d1; \\\n\ + read_type imageData; \\\n\ + write_type imgVal0, imgVal1; \\\n\ + \\\n\ + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_d1 = gidz + input_depth; \\\n\ + VXC_DP4x4(imgVal0, imageData, imageData, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractEvenFp16Stride2_4x4); \\\n\ + VXC_DP4x4(imgVal1, imageData, imageData, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractOddFp16Stride2_4x4); \\\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.z = out_d1; \\\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SPACE2DEPTH_INTERNAL_16BITS_X2Y1(I16, I16, vxc_short8, vxc_short8)\n\ +SPACE2DEPTH_INTERNAL_16BITS_X2Y1(F16, F16, vxc_short8, vxc_short8)"; /* end of space2depth_internal_vx*/ + static const char swish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -33164,6 +34459,151 @@ __kernel void upsample_U8_U8to_F16_2D\n\ }\n\ "; /* end of upsample_U8_vx*/ +static const char upsamplescale_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDatatoF32_4x4;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float tail;\n\ +\n\ +#define UPSAMPLE_SCALETO_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type, conv_func) \\\n\ + __kernel void upsamplescale_##src_name##to##dst_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride, \\\n\ + float scale) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type read_val; \\\n\ + src_type src_val; \\\n\ + dst_type dst_val; \\\n\ + write_type write_val; \\\n\ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src_val, read_val, 16); \\\n\ + coord.xy *= stride; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + float4 data; \\\n\ + 
VXC_DP4x4(data, src_val, src_val, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertDatatoF32_4x4); \\\n\ + data = data * output_scale + tail; \\\n\ + _viv_asm(conv_func, dst_val, data); \\\n\ + _viv_asm(COPY, write_val, dst_val, 16); \\\n\ + int4 coord_out = coord; \\\n\ + for (int y = 0; y < stride; y++) \\\n\ + { \\\n\ + coord_out.x = coord.x; \\\n\ + for (int x = 0; x < stride; ) \\\n\ + { \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, write_val, \\\n\ + VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); \\\n\ + x++; \\\n\ + coord_out.x ++; \\\n\ + } \\\n\ + coord_out.y ++; \\\n\ + } \\\n\ +}\n\ +\n\ +UPSAMPLE_SCALETO_FUN(F16, F16, vxc_short8, vxc_half8, half4, short4, CONV)\n\ +UPSAMPLE_SCALETO_FUN(F16, I16, vxc_short8, vxc_half8, int4, short4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(F16, I8, vxc_short8, vxc_half8, int4, char4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(F16, U8, vxc_short8, vxc_half8, int4, uchar4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(I16, I16, vxc_short8, vxc_short8, int4, short4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(I16, F16, vxc_short8, vxc_short8, half4, short4, CONV)\n\ +UPSAMPLE_SCALETO_FUN(I8, I8, vxc_char16, vxc_char16, int4, char4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(I8, F16, vxc_short8, vxc_short8, half4, short4, CONV)\n\ +UPSAMPLE_SCALETO_FUN(U8, U8, vxc_uchar16, vxc_uchar16, int4, uchar4, CONV_RTE)\n\ +UPSAMPLE_SCALETO_FUN(U8, F16, vxc_short8, vxc_short8, half4, short4, CONV)\n\ +\n\ +"; /* end of upsamplescale_vx*/ + +static const char upsamplescale_k2_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniUpScale2X_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniUpScale2X_hi_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define UPSAMPLE_SCALETO8B_FUN(src_name, dst_name, read_type, src_type, dst_type) \\\n\ + __kernel void upsamplescale_##src_name##to##dst_name##_K2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride, \\\n\ + float scale) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type read_val; \\\n\ + src_type src_val; \\\n\ + dst_type dst_val; \\\n\ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src_val, read_val, 16); \\\n\ + coord.xy <<= 1; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst_val, src_val, multiplier, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_lo_2x8); \\\n\ + VXC_DP2x8(dst_val, src_val, multiplier, \\\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_hi_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.y ++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +UPSAMPLE_SCALETO8B_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char16)\n\ +UPSAMPLE_SCALETO8B_FUN(F16, U8, vxc_short8, vxc_half8, vxc_uchar16)\n\ +UPSAMPLE_SCALETO8B_FUN(I8, I8, vxc_char16, vxc_char16, vxc_char16)\n\ +UPSAMPLE_SCALETO8B_FUN(U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define UPSAMPLE_SCALETO16B_FUN(src_name, dst_name, 
read_type, src_type, dst_type, write_type) \\\n\ + __kernel void upsamplescale_##src_name##to##dst_name##_K2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride, \\\n\ + float scale) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type read_val; \\\n\ + src_type src_val; \\\n\ + dst_type dst0_val; \\\n\ + dst_type dst1_val; \\\n\ + write_type write_val; \\\n\ + VXC_ReadImage2DArray(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src_val, read_val, 16); \\\n\ + coord.xy <<= 1; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst0_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_lo_2x8); \\\n\ + VXC_DP2x8(dst1_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_hi_2x8); \\\n\ + _viv_asm(COPY, write_val, dst0_val, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.y ++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, write_val, dst1_val, 16); \\\n\ + coord.xy = coord.xy + (int2)(8, -1); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.y ++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +UPSAMPLE_SCALETO16B_FUN(F16, F16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(I8, F16, vxc_char16, vxc_char16, vxc_half8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(U8, F16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +"; /* end of upsamplescale_k2_vx*/ + static const char vsi_nn_kernel_axis_aligned_bbox_transform_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcAxis_aligned_bbox_transform(\n\ @@ -33184,119 +34624,6 @@ __kernel void vxcBox_with_nms_limit(\n\ }\n\ "; /* end of vsi_nn_kernel_box_with_nms_limit_vx*/ -static const char vsi_nn_kernel_crop_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -//-----------------------------------------------tensor crop-------------------------------\n\ -__kernel void vxcTensorCrop_Int16(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int offset0,\n\ - int offset1,\n\ - int offset2)\n\ -{\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - vxc_ushort8 src0, src1, src2, src3;\n\ -\n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - 
VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\\\n\ - - offset1, get_global_id(2) - offset2, 0);\n\ -\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void vxcTensorCrop_Int8(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int offset0,\n\ - int offset1,\n\ - int offset2)\n\ -{\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 src0, src1, src2, src3;\n\ -\n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1) - offset1,\\\n\ - get_global_id(2) - offset2, 0);\n\ -\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;\n\ -\n\ -__kernel void vxcTensorCrop_Int16_Fp16(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int offset0,\n\ - int offset1,\n\ - int offset2)\n\ -{\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - vxc_short8 src0, src1, src2, src3;\n\ -\n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\\\n\ - - offset1, get_global_id(2) - offset2, 0);\n\ -\n\ - vxc_half8 dst0, dst1, dst2, dst3;\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt16toFp16_2x8);\n\ - VXC_DP2x8(dst1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt16toFp16_2x8);\n\ - VXC_DP2x8(dst2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt16toFp16_2x8);\n\ - VXC_DP2x8(dst3, 
src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt16toFp16_2x8);\n\ -\n\ - vxc_short8 out0, out1, out2, out3;\n\ - _viv_asm(COPY, out0, dst0, 16);\n\ - _viv_asm(COPY, out1, dst1, 16);\n\ - _viv_asm(COPY, out2, dst2, 16);\n\ - _viv_asm(COPY, out3, dst3, 16);\n\ -\n\ - VXC_WriteImage2DArray(output, coord_out, out0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, out1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, out2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.y ++;\n\ - VXC_WriteImage2DArray(output, coord_out, out3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_crop_vx*/ - static const char vsi_nn_kernel_detection_postprocess_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcDetection_postprocess(\n\ @@ -33352,71 +34679,6 @@ __kernel void vxcExtra_ending_u8(\n\ }\n\ "; /* end of vsi_nn_kernel_extra_ending_vx*/ -static const char vsi_nn_kernel_fullconnect2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int loopNum;\n\ -_viv_uniform VXC_512Bits uniMulAcc_16x1;\n\ -__kernel void vsi_nn_kernel_fullconnect2(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_array_t weight,\n\ - __read_only image2d_array_t bias,\n\ - __write_only image2d_array_t output)\n\ -{\n\ - int4 coord_in = (int4)(16, get_global_id(0), get_global_id(1), 0);\n\ - int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 v0, v1, v2, v3, v4, v5, v6, v7;\n\ - vxc_half8 i0, i1, i2, i3;\n\ - vxc_half8 w0, w1, w2, w3;\n\ - float4 sum = 0;\n\ - float dst = 0;\n\ - dst = read_imagef(bias, coord_in.ywww).x;\n\ - do\n\ - {\n\ - VXC_ReadImage(v0, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, i0, v0, 16);\n\ - VXC_ReadImage(v1, weight, coord_in.xy, VXC_5BITOFFSET_XY(-16, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w0, v1, 16);\n\ - VXC_ReadImage(v2, input, coord_in.xz, VXC_5BITOFFSET_XY(-8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, i1, v2, 16);\n\ - VXC_ReadImage(v3, weight, coord_in.xy, VXC_5BITOFFSET_XY(-8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w1, v3, 16);\n\ - VXC_ReadImage(v4, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, i2, v4, 16);\n\ - VXC_ReadImage(v5, weight, coord_in.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w2, v5, 16);\n\ - VXC_ReadImage(v6, input, coord_in.xz, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, i3, v6, 16);\n\ - VXC_ReadImage(v7, weight, coord_in.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w3, v7, 16);\n\ -\n\ - coord_in.x += 32;\n\ -\n\ - VXC_DP16x1(sum, i0, w0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ - VXC_DP16x1(sum, i1, w1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ - VXC_DP16x1(sum, i2, w2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ - VXC_DP16x1(sum, i3, w3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ -\n\ - float4 tmp = {1, 1, 1, 1};\n\ - dst = dst + dot(sum, tmp);\n\ -\n\ - } while (coord_in.x < loopNum);\n\ -\n\ - vxc_half v;\n\ - 
_viv_asm(CONV, v, dst);\n\ - _viv_asm(COPY, v0, v, 16);\n\ - VXC_WriteImage(output, coord_out.xy, v0, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_fullconnect2_vx*/ - static const char vsi_nn_kernel_generate_proposals_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcGenerate_proposals(\n\ @@ -35071,426 +36333,6 @@ __kernel void GrayScaletoTensor_UInt8\n\ }\n\ "; /* end of vsi_nn_kernel_imageprocess_5_vx*/ -static const char vsi_nn_kernel_layernormalize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/**************************layernorm float16***********************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void vxcLayerNorm(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xwww);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float 
outputScale;\n\ -_viv_uniform int output_ZP;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -__kernel void vxcLayerNorm_u8(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int4 coord_bias = (int4)(0, 0, 0, 0);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - coord_bias.x = coord.x;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - coord_bias.x += 4;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + 
bias_f1;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/***************************layernorm float16 to uint8**************************/\n\ -_viv_uniform float outputZP;\n\ -__kernel void vxcLayerNormFP16toU8(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xwww);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - norm = norm * outputScale + outputZP;\n\ - int4 output_int4;\n\ - output_int4 = convert_int4_rte(norm);\n\ - vxc_uchar8 dst;\n\ - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of vsi_nn_kernel_layernormalize_vx*/ - -static const char vsi_nn_kernel_layernormalize_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/*****************************layernorm uint8 to fp16****************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits 
uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits UniPackFP16even_2x8;\n\ -\n\ -__kernel void vxcLayerNormU8toFp16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int4 coord_bias = (int4)(0, 0, 0, 0);\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_short8 src1, outval;\n\ - short zp = inputZP;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - coord_bias.x = coord.x;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * 
vari * tmpData0 + bias_f0;\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - coord_bias.x += 4;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - int2 coord_out = (int2)(coord.x, coord.y);\n\ - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x += 8;\n\ - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -"; /* end of vsi_nn_kernel_layernormalize_U8_vx*/ - -static const char vsi_nn_kernel_resize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -//--------------------------resize-------------------------\n\ -_viv_uniform VXC_512Bits uniPackEvenData_2x8;\n\ -__kernel void resize_16bits_downsample_quarter\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - vxc_short8 src0, src1;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord = coord >> 1;\n\ - VXC_DP2x8(src0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardInf, 0), uniPackEvenData_2x8);\n\ - VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void resize_8bits_downsample_quarter\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - vxc_char16 src0;\n\ - vxc_char8 dst;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord = coord >> 1;\n\ - dst = src0.s02468ace;\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_resize_vx*/ - static const char vsi_nn_kernel_roi_align_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcRoi_align(\n\ @@ -35501,193 +36343,6 @@ __kernel void vxcRoi_align(\n\ }\n\ "; /* end of vsi_nn_kernel_roi_align_vx*/ -static const char vsi_nn_kernel_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -//--------------------------scale-------------------------\n\ -_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ -_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Lo_4x4;\n\ -_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Hi_4x4;\n\ -__kernel void scale_fp16\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_array_t weights,\n\ - __read_only image2d_array_t 
biases,\n\ - __write_only image2d_array_t output\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ - vxc_short8 vec0, vec1;\n\ - vxc_half8 src0;\n\ - vxc_half8 w0;\n\ - vxc_float4 b0, b1;\n\ - vxc_float4 dst0, dst1;\n\ - VXC_ReadImage(vec0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, src0, vec0, 16);\n\ - VXC_ReadImage(vec1, weights, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, w0, vec1, 16);\n\ -\n\ - coord.z = coord.x + 4;\n\ -\n\ - b0 = read_imagef(biases, coord.xwww);\n\ - b1 = read_imagef(biases, coord.zwww);\n\ -\n\ - VXC_DP4x4(dst0, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16MulFp16ToFp32_Lo_4x4);\n\ - VXC_DP4x4(dst1, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16MulFp16ToFp32_Hi_4x4);\n\ - dst0 += b0;\n\ - dst1 += b1;\n\ -\n\ - half4 t0, t1;\n\ -\n\ - _viv_asm(CONV, t0, dst0);\n\ - _viv_asm(CONV, t1, dst1);\n\ -\n\ - VXC_DP2x8(w0, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ - _viv_asm(COPY, vec0, w0, 16);\n\ -\n\ - VXC_WriteImage(output, coord.xy, vec0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_scale_vx*/ - -static const char vsi_nn_kernel_shufflechannel_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/******************shuffle channel float16/int16********************/\n\ -_viv_uniform int group_column;\n\ -_viv_uniform float rgroup_column;\n\ -\n\ -__kernel void shuffleChannelVXC(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int group_number,\n\ - int axis)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - vxc_short8 src0, src1, src2, src3;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - int coordz = coord.z;\n\ - int index_col = coordz * rgroup_column;\n\ - int index_row = coordz - index_col * group_column;\n\ - coord.z = index_row * group_number + index_col;\n\ - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -/*****************shuffle channel int8/uint8****************************/\n\ -\n\ -__kernel void shuffleChannel8BitsVXC(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int group_number,\n\ - int axis)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - vxc_char16 src0, src1, src2, src3;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - 
VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - int coordz = coord.z;\n\ - int index_col = coordz * rgroup_column;\n\ - int index_row = coordz - index_col * group_column;\n\ - coord.z = index_row * group_number + index_col;\n\ - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord.y ++;\n\ - VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_shufflechannel_vx*/ - -static const char vsi_nn_kernel_shufflechannel_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/******************shuffle channel float16/int16********************/\n\ -_viv_uniform int group_column;\n\ -_viv_uniform float rgroup_column;\n\ -\n\ -__kernel void shuffleChannel16Bits_Axis1(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int group_number,\n\ - int axis)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ - vxc_short8 src0, src1, src2, src3;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - int coordy = coord.y;\n\ - int index_col = coordy * rgroup_column;\n\ - int index_row = coordy - index_col * group_column;\n\ - coord_out.y = index_row * group_number + index_col;\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.x += 8;\n\ - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.x += 8;\n\ - VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.x += 8;\n\ - VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -/*****************shuffle channel int8/uint8****************************/\n\ -\n\ -__kernel void shuffleChannel8Bits_Axis1(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int group_number,\n\ - int axis)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ - vxc_char16 src0, src1;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 16;\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - int coordy = coord.y;\n\ - int index_col = coordy * rgroup_column;\n\ - int index_row = coordy - index_col * group_column;\n\ - coord_out.y = index_row * group_number + index_col;\n\ - VXC_WriteImage2DArray(output, coord_out, 
src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.x += 16;\n\ - VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_shufflechannel_axis1_vx*/ - static const char vsi_nn_kernel_signalframe_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int input_width;\n\ @@ -35968,49 +36623,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void vxcSignalFrame_ten #endif\n\ "; /* end of vsi_nn_kernel_signalframe_vx*/ -static const char vsi_nn_kernel_space2depth_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4;\n\ -_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4;\n\ -_viv_uniform int input_depth;\n\ -\n\ -__kernel void vxcReorg2_fp16_fp16_sx2_sy1\n\ - (\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int stridex,\n\ - int stridey\n\ - )\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ -\n\ - int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ - int4 coord_out = (int4)(gidx >> 1, gidy, 0, 0);\n\ - int out_d0, out_d1;\n\ - vxc_short8 imageData;\n\ - vxc_short8 imgVal0, imgVal1;\n\ - //int tmpw = gidz / input_depth; \\n\\\n\ - //int tmpz = gidz % input_depth; \\n\\\n\ -\n\ - VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(imgVal0, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ - uniExtractEvenFp16Stride2_4x4);\n\ - VXC_DP4x4(imgVal1, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ - uniExtractOddFp16Stride2_4x4);\n\ -\n\ - out_d0 = gidz * 2 * 1;\n\ - out_d1 = out_d0 + 1;\n\ -\n\ - coord_out.z = out_d0;\n\ - VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_out.z = out_d1;\n\ - VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_space2depth_vx*/ - static const char vsi_nn_kernel_tensorstackconcat_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ /*******************tensorstackconcat 16BITs********************/\n\ @@ -39894,6 +40506,151 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a }\n\ "; /* end of l2normalizescale_axis1_cl*/ +static const char layer_normalization_cl[] = "\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float e2InScale,\n\ + float scale_inOut,\n\ + float sumZpScale,\n\ + float zp2ScaleE2,\n\ + float sumZpScaleE2,\n\ + int width,\n\ + int height,\n\ + float dim_ratio\n\ + )\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + float4 data, dst;\n\ + float2 sumSqr = (float2)(0);\n\ + float scale_vari, bias_val;\n\ + __local float2 local_sum[16];\n\ +\n\ + for(; coord.x < width;)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + coord.x += 16;\n\ + sumSqr.x += data.x;\n\ + sumSqr.y += data.x * data.x;\n\ + }\n\ + local_sum[lidx] = sumSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx == 0)\n\ + {\n\ + for(int i = 1; i < 16; i++)\n\ + {\n\ + sumSqr += local_sum[i];\n\ + }\n\ + local_sum[0] = 
sumSqr;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + sumSqr = local_sum[0] * dim_ratio;\n\ + sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps;\n\ + sumSqr.s1 = rsqrt(sumSqr.s1);\n\ +\n\ + for(coord.x = lidx; coord.x < width;)\n\ + {\n\ + float4 gamma = read_imagef(scale, coord.xw);\n\ + float4 beta = read_imagef(bias, coord.xw);\n\ + data = read_imagef(input, coord);\n\ +\n\ + scale_vari = gamma.s0 * sumSqr.s1;\n\ + bias_val = (beta.s0 - scale_vari * sumSqr.s0);\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ + coord.x += 16;\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float e2InScale,\n\ + float scale_inOut,\n\ + float sumZpScale,\n\ + float zp2ScaleE2,\n\ + float sumZpScaleE2,\n\ + int width,\n\ + int height,\n\ + float dim_ratio\n\ + )\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int4 coord = (int4)(lidx, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + uint4 data, dst;\n\ + float2 sumSqr;\n\ + uint tmpSum = 0, tmpSqr = 0;\n\ + float scale_vari, bias_val;\n\ + __local uint local_sum[1];\n\ + __local uint local_sqr[1];\n\ +\n\ + if(lidx == 0)\n\ + {\n\ + local_sum[0] = 0;\n\ + local_sqr[0] = 0;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for(; coord.x < width;)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + coord.x+=16;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + atom_add(local_sum, tmpSum);\n\ + atom_add(local_sqr, tmpSqr);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + tmpSum = local_sum[0];\n\ + tmpSqr = local_sqr[0];\n\ + //sumSqr.x = ((float)tmpSum - width * input_zp) * input_scale;\n\ + //sumSqr.y = ((float)tmpSqr - 2 * input_zp * (float)tmpSum + width * input_zp * input_zp) * e2InScale;\n\ + sumSqr.x = (float)tmpSum * input_scale - sumZpScale;\n\ + sumSqr.y = (float)tmpSqr * e2InScale - zp2ScaleE2 * (float)tmpSum + sumZpScaleE2;\n\ +\n\ + sumSqr *= dim_ratio;\n\ + sumSqr.s1 = sumSqr.s1 - sumSqr.s0 * sumSqr.s0 + eps;\n\ + sumSqr.s1 = rsqrt(sumSqr.s1);\n\ +\n\ + for(coord.x = lidx; coord.x < width;)\n\ + {\n\ + float4 gamma = read_imagef(scale, coord.xw);\n\ + float4 beta = read_imagef(bias, coord.xw);\n\ + data = read_imageui(input, coord);\n\ +\n\ + scale_vari = gamma.s0 * sumSqr.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * sumSqr.s0) * output_scale + output_zp;\n\ +\n\ + float tmpVal = data.x - input_zp;\n\ +\n\ + float4 norm;\n\ + norm.x = tmpVal * alpha + bias_val;\n\ + dst = convert_uint4_rte(norm);\n\ + write_imageui(output, coord, dst);\n\ + coord.x+=16;\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_cl*/ + static const char log_softmax_axis0_cl[] = "#define rlogE (0.693147182f)\n\ float LOG(float x)\n\ {\n\ @@ -43435,32 +44192,30 @@ static const char matrixmul_cl[] = "__kernel void gemm_F32F32toF32_2D(\n\ int K,\n\ int N,\n\ int ac2zero,\n\ - int bc2zero\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ )\n\ {\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ -\n\ - int2 coord_a = (int2)(0, gidy);\n\ - int2 coord_b = (int2)(gidx, 0);\n\ -\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ float4 sum = 
(float4)(0);\n\ \n\ - for(; coord_a.x < K;)\n\ + for(; coord.z < K;)\n\ {\n\ float4 tempA0;\n\ float4 tempB0;\n\ \n\ - tempA0 = read_imagef(inputA, coord_a);\n\ - tempB0 = read_imagef(inputB, coord_b);\n\ - coord_a.x++;\n\ - coord_b.y++;\n\ + tempA0 = read_imagef(inputA, coord.zy);\n\ + tempB0 = read_imagef(inputB, coord.xz);\n\ + coord.z++;\n\ \n\ - sum += tempA0 * tempB0;\n\ + sum = sum + tempA0 * tempB0;\n\ }\n\ -\n\ - coord_b.y = gidy;\n\ - write_imagef(output, coord_b, sum);\n\ + write_imagef(output, coord.xy, sum);\n\ }\n\ \n\ __kernel void gemm_F32F32toF32_3D(\n\ @@ -43471,7 +44226,13 @@ __kernel void gemm_F32F32toF32_3D(\n\ int K,\n\ int N,\n\ int ac2zero,\n\ - int bc2zero\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ )\n\ {\n\ int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);\n\ @@ -43489,13 +44250,163 @@ __kernel void gemm_F32F32toF32_3D(\n\ coord_a.x++;\n\ coord_b.y++;\n\ \n\ - sum += tempA0 * tempB0;\n\ + sum = sum + tempA0 * tempB0;\n\ }\n\ \n\ coord_b.y = get_global_id(1);\n\ coord_b.z = get_global_id(2);\n\ write_imagef(output, coord_b, sum);\n\ }\n\ +\n\ +__kernel void gemm_transb_F32F32toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord.z < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord.zy);\n\ + tempB0 = read_imagef(inputB, coord.zx);\n\ + coord.z++;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ + write_imagef(output, coord.xy, sum);\n\ +}\n\ +\n\ +__kernel void gemm_transb_F32F32toF32_3D(\n\ + __read_only image2d_array_t inputA,\n\ + __read_only image2d_array_t inputB,\n\ + __write_only image2d_array_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.x < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = read_imagef(inputB, coord_b);\n\ + coord_a.x++;\n\ + coord_b.x++;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_a.x = get_global_id(0);\n\ + coord_a.z = get_global_id(2);\n\ + write_imagef(output, coord_b, sum);\n\ +}\n\ +\n\ +__kernel void gemm_transb_F32I8toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + float4 sum = (float4)(0);\n\ + for(; coord.z < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord.zy);\n\ + tempB0 = convert_float4(read_imagei(inputB, coord.zx));\n\ + coord.z++;\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ +}\n\ +\n\ +__kernel void gemm_transb_F32I8toF32_3D(\n\ + __read_only image2d_array_t inputA,\n\ + __read_only image2d_array_t inputB,\n\ + __write_only image2d_array_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.x < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = convert_float4(read_imagei(inputB, coord_b));\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b;\n\ + coord_a.x++;\n\ + coord_b.x++;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_a.x = get_global_id(0);\n\ + coord_a.z = get_global_id(2);\n\ + write_imagef(output, coord_b, sum);\n\ +}\n\ "; /* end of matrixmul_cl*/ static const char matrixmul_transA_cl[] = "__kernel void gemm_transa_F32F32toF32_2D(\n\ @@ -43506,32 +44417,30 @@ static const char matrixmul_transA_cl[] = "__kernel void gemm_transa_F32F32toF32 int K,\n\ int N,\n\ int ac2zero,\n\ - int bc2zero\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ )\n\ {\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ -\n\ - int2 coord_a = (int2)(gidy, 0);\n\ - int2 coord_b = (int2)(gidx, 0);\n\ -\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ float4 sum = (float4)(0);\n\ \n\ - for(; coord_a.y < K;)\n\ + for(; coord.z < K;)\n\ {\n\ float4 tempA0;\n\ float4 tempB0;\n\ \n\ - tempA0 = read_imagef(inputA, coord_a);\n\ - tempB0 = read_imagef(inputB, coord_b);\n\ - coord_a.y++;\n\ - coord_b.y++;\n\ + tempA0 = read_imagef(inputA, coord.yz);\n\ + tempB0 = read_imagef(inputB, coord.xz);\n\ + coord.z++;\n\ \n\ - sum += tempA0 * tempB0;\n\ + sum = sum + tempA0 * tempB0;\n\ }\n\ -\n\ - coord_b.y = gidy;\n\ - write_imagef(output, coord_b, sum);\n\ + write_imagef(output, coord.xy, sum);\n\ }\n\ \n\ __kernel void gemm_transa_F32F32toF32_3D(\n\ @@ -43542,7 +44451,13 @@ __kernel void 
gemm_transa_F32F32toF32_3D(\n\ int K,\n\ int N,\n\ int ac2zero,\n\ - int bc2zero\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -43563,7 +44478,7 @@ __kernel void gemm_transa_F32F32toF32_3D(\n\ coord_a.y++;\n\ coord_b.y++;\n\ \n\ - sum += tempA0 * tempB0;\n\ + sum = sum + tempA0 * tempB0;\n\ }\n\ \n\ coord_b.y = gidy;\n\ @@ -47186,6 +48101,115 @@ __kernel void resize_nearest_U8toU8(\n\ }\n\ "; /* end of resize_nearest_cl*/ +static const char roi_align_cl[] = "inline float roi_align_1x1\n\ +(\n\ + __read_only image2d_array_t input,\n\ + float2 region_start,\n\ + float2 region_end,\n\ + float2 bin_size,\n\ + int2 grid_size,\n\ + float2 rcp_of_grid_size,\n\ + int pz\n\ +)\n\ +{\n\ + float sum = 0;\n\ +\n\ + for(int iy = 0; iy < grid_size.y; ++iy)\n\ + {\n\ + for(int ix = 0; ix < grid_size.x; ++ix)\n\ + {\n\ + float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);\n\ + float2 pos = region_start + ixy * bin_size * rcp_of_grid_size;\n\ +\n\ + int2 xy_low = convert_int2(pos);\n\ + int2 xy_high = xy_low + 1;\n\ +\n\ + float ly = pos.y - xy_low.y;\n\ + float lx = pos.x - xy_low.x;\n\ + float hy = 1.0f - ly;\n\ + float hx = 1.0f - lx;\n\ +\n\ + float w1 = hy * hx;\n\ + float w2 = hy * lx;\n\ + float w3 = ly * hx;\n\ + float w4 = ly * lx;\n\ +\n\ + float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;\n\ + float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;\n\ + float data3 = read_imagef(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;\n\ + float data4 = read_imagef(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;\n\ +\n\ + sum = sum + w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;\n\ + }\n\ + }\n\ +\n\ + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);\n\ +}\n\ +\n\ +\n\ +#define EPS_GRID 0.00001f\n\ +__kernel void roi_align_F32toF32\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t rois,\n\ + __read_only image2d_t n_rois,\n\ + __write_only image2d_array_t output,\n\ + float spatial_x_scale,\n\ + float spatial_y_scale,\n\ + float in_width,\n\ + float in_height,\n\ + float rcp_of_out_width,\n\ + float rcp_of_out_height,\n\ + float sampling_x_ratio,\n\ + float sampling_y_ratio,\n\ + int depth\n\ +)\n\ +{\n\ + int px = get_global_id(0);\n\ + int py = get_global_id(1);\n\ + int pw = get_global_id(2);\n\ +\n\ + int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x;\n\ + float4 roi_x = read_imagef(rois, (int2)(0, pw));\n\ + float4 roi_y = read_imagef(rois, (int2)(1, pw));\n\ + float4 roi_z = read_imagef(rois, (int2)(2, pw));\n\ + float4 roi_w = read_imagef(rois, (int2)(3, pw));\n\ + float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, roi_w.x);\n\ +\n\ + float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale);\n\ + float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f);\n\ +\n\ + float2 spatial_indx = (float2)(px, py);\n\ + float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);\n\ + float2 max_spatial_dims = (float2)(in_width, in_height);\n\ +\n\ + float2 bin_size = roi_dims * pooled_dims;\n\ + float2 region_start = spatial_indx * bin_size + roi_anchor.xy;\n\ + float2 region_end = region_start + bin_size;\n\ +\n\ + float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio);\n\ +\n\ + roi_bin_grid = roi_bin_grid == 0 ? 
ceil(bin_size - EPS_GRID) : roi_bin_grid;\n\ +\n\ + int kz = roi_batch * depth;\n\ + float2 rcp_of_grid_size = 1.0f / roi_bin_grid;\n\ + int2 grid_size_xy = convert_int2(roi_bin_grid);\n\ + float4 interp;\n\ + int kz1 = pw * depth;\n\ + for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++)\n\ + {\n\ + interp.x = roi_align_1x1( input,\n\ + region_start,\n\ + region_end,\n\ + bin_size,\n\ + grid_size_xy,\n\ + rcp_of_grid_size,\n\ + kz);\n\ +\n\ + write_imagef(output, (int4)(px, py, kz1, 0), interp);\n\ + }\n\ +}"; /* end of roi_align_cl*/ + static const char scatter_nd_cl[] = "__kernel void scatter_nd_U32toU32_1D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ @@ -47548,6 +48572,98 @@ __kernel void select_I8_F32_F32toF32_2D(\n\ }\n\ "; /* end of select_cl*/ +static const char space2depth_internal_cl[] = "\n\ +__kernel void space2depth_internal_F32toF32 (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x, int block_size_y,\n\ + float scaleInOut, float zpInOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int inDepth = get_image_array_size(input);\n\ +\n\ + int4 coord = (int4)(x, y, z, 0);\n\ + float4 data = {0.0};\n\ + data = read_imagef(input, coord);\n\ +\n\ + ushort blockSize_x = convert_ushort(block_size_x);\n\ + ushort blockSize_y = convert_ushort(block_size_y);\n\ + int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0);\n\ + coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth\n\ + + z;\n\ + write_imagef(output, coord_out, data);\n\ +}\n\ +\n\ +__kernel void space2depth_internal_F32toF32_X2Y1 (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x, int block_size_y,\n\ + float scaleInOut, float zpInOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int inDepth = get_image_array_size(input);\n\ +\n\ + int4 coord = (int4)(x, y, z, 0);\n\ + float4 data = {0.0};\n\ + data = read_imagef(input, coord);\n\ +\n\ + int4 coord_out = (int4)(x >> 1, y, 0, 0);\n\ + coord_out.z = (x & 1) * inDepth + z;\n\ + write_imagef(output, coord_out, data);\n\ +}\n\ +\n\ +__kernel void space2depth_internal_U8toU8 (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x, int block_size_y,\n\ + float scaleInOut, float zpInOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int inDepth = get_image_array_size(input);\n\ +\n\ + int4 coord = (int4)(x, y, z, 0);\n\ + uint4 data = {0};\n\ + data = read_imageui(input, coord);\n\ +\n\ + ushort blockSize_x = convert_ushort(block_size_x);\n\ + ushort blockSize_y = convert_ushort(block_size_y);\n\ + int4 coord_out = (int4)(convert_ushort(x)/blockSize_x, convert_ushort(y)/blockSize_y, 0, 0);\n\ + coord_out.z = ((x - coord_out.x * block_size_x) + (y - coord_out.y * block_size_y) * block_size_x) * inDepth\n\ + + z;\n\ +\n\ + data.x = convert_uint(data.x * scaleInOut + zpInOut);\n\ + write_imageui(output, coord_out, data);\n\ +}\n\ +\n\ +__kernel void space2depth_internal_U8toU8_X2Y1 (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size_x, int block_size_y,\n\ + float scaleInOut, float zpInOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int inDepth = get_image_array_size(input);\n\ +\n\ + int4 coord = (int4)(x, y, z, 0);\n\ + uint4 data 
= {0};\n\ + data = read_imageui(input, coord);\n\ +\n\ + int4 coord_out = (int4)(x >> 1, y, 0, 0);\n\ + coord_out.z = (x & 1) * inDepth + z;\n\ +\n\ + data.x = convert_uint(data.x * scaleInOut + zpInOut);\n\ + write_imageui(output, coord_out, data);\n\ +}\n\ +"; /* end of space2depth_internal_cl*/ + static const char swish_cl[] = "float sigmoid_(float x, float logE)\n\ {\n\ x *= -logE;\n\ @@ -48019,6 +49135,13 @@ static const source_map_t evis_resource[] = {"instance_normalization_u8_vx", instance_normalization_u8_vx}, {"l2normalizescale_axis0_vx", l2normalizescale_axis0_vx}, {"l2normalizescale_axis1_vx", l2normalizescale_axis1_vx}, + {"layer_normalization_vx", layer_normalization_vx}, + {"layer_normalization_2d_vx", layer_normalization_2d_vx}, + {"layer_normalization_i16_vx", layer_normalization_i16_vx}, + {"layer_normalization_u8_f16_vx", layer_normalization_u8_f16_vx}, + {"layer_normalization_wh_f16_vx", layer_normalization_wh_f16_vx}, + {"layer_normalization_wh_i16_vx", layer_normalization_wh_i16_vx}, + {"layer_normalization_wh_u8_vx", layer_normalization_wh_u8_vx}, {"log_softmax_axis0_vx", log_softmax_axis0_vx}, {"log_softmax_axis0_BF16_vx", log_softmax_axis0_BF16_vx}, {"log_softmax_axis1_vx", log_softmax_axis1_vx}, @@ -48082,27 +49205,21 @@ static const source_map_t evis_resource[] = {"pow_i8_vx", pow_i8_vx}, {"pow_u8_vx", pow_u8_vx}, {"pre_process_bgra_vx", pre_process_bgra_vx}, - {"pre_process_bgra_trans_vx", pre_process_bgra_trans_vx}, {"pre_process_gray_vx", pre_process_gray_vx}, {"pre_process_gray_copy_vx", pre_process_gray_copy_vx}, {"pre_process_nv12_scale_vx", pre_process_nv12_scale_vx}, {"pre_process_nv12_scale_8bits_vx", pre_process_nv12_scale_8bits_vx}, {"pre_process_nv12_scale_mix_vx", pre_process_nv12_scale_mix_vx}, - {"pre_process_nv12_trans_u8_vx", pre_process_nv12_trans_u8_vx}, {"pre_process_rgb_vx", pre_process_rgb_vx}, {"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx}, - {"pre_process_rgb_copy_trans_vx", pre_process_rgb_copy_trans_vx}, - {"pre_process_rgb_trans_vx", pre_process_rgb_trans_vx}, {"pre_process_yuv420_copy_u8_vx", pre_process_yuv420_copy_u8_vx}, {"pre_process_yuv420_scale_fp16_vx", pre_process_yuv420_scale_fp16_vx}, {"pre_process_yuv420_scale_i16_vx", pre_process_yuv420_scale_i16_vx}, {"pre_process_yuv420_scale_i8_vx", pre_process_yuv420_scale_i8_vx}, {"pre_process_yuv420_scale_u8_vx", pre_process_yuv420_scale_u8_vx}, - {"pre_process_yuv420_trans_u8_vx", pre_process_yuv420_trans_u8_vx}, {"pre_process_yuv444_copy_u8_vx", pre_process_yuv444_copy_u8_vx}, {"pre_process_yuv444_scale_vx", pre_process_yuv444_scale_vx}, {"pre_process_yuv444_scale_fp16_vx", pre_process_yuv444_scale_fp16_vx}, - {"pre_process_yuv444_trans_u8_vx", pre_process_yuv444_trans_u8_vx}, {"prelu_vx", prelu_vx}, {"prelu_BF16_vx", prelu_BF16_vx}, {"random_multinomial_vx", random_multinomial_vx}, @@ -48144,6 +49261,7 @@ static const source_map_t evis_resource[] = {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, {"select_vx", select_vx}, + {"space2depth_internal_vx", space2depth_internal_vx}, {"swish_vx", swish_vx}, {"tile_vx", tile_vx}, {"tile_mix_vx", tile_mix_vx}, @@ -48151,12 +49269,12 @@ static const source_map_t evis_resource[] = {"upsample_I16_vx", upsample_I16_vx}, {"upsample_I8_vx", upsample_I8_vx}, {"upsample_U8_vx", upsample_U8_vx}, + {"upsamplescale_vx", upsamplescale_vx}, + {"upsamplescale_k2_vx", upsamplescale_k2_vx}, {"vsi_nn_kernel_axis_aligned_bbox_transform_vx", vsi_nn_kernel_axis_aligned_bbox_transform_vx}, {"vsi_nn_kernel_box_with_nms_limit_vx", 
vsi_nn_kernel_box_with_nms_limit_vx}, - {"vsi_nn_kernel_crop_vx", vsi_nn_kernel_crop_vx}, {"vsi_nn_kernel_detection_postprocess_vx", vsi_nn_kernel_detection_postprocess_vx}, {"vsi_nn_kernel_extra_ending_vx", vsi_nn_kernel_extra_ending_vx}, - {"vsi_nn_kernel_fullconnect2_vx", vsi_nn_kernel_fullconnect2_vx}, {"vsi_nn_kernel_generate_proposals_vx", vsi_nn_kernel_generate_proposals_vx}, {"vsi_nn_kernel_header_vx", vsi_nn_kernel_header_vx}, {"vsi_nn_kernel_heatmap_max_keypoint_vx", vsi_nn_kernel_heatmap_max_keypoint_vx}, @@ -48165,15 +49283,8 @@ static const source_map_t evis_resource[] = {"vsi_nn_kernel_imageprocess_3_vx", vsi_nn_kernel_imageprocess_3_vx}, {"vsi_nn_kernel_imageprocess_4_vx", vsi_nn_kernel_imageprocess_4_vx}, {"vsi_nn_kernel_imageprocess_5_vx", vsi_nn_kernel_imageprocess_5_vx}, - {"vsi_nn_kernel_layernormalize_vx", vsi_nn_kernel_layernormalize_vx}, - {"vsi_nn_kernel_layernormalize_U8_vx", vsi_nn_kernel_layernormalize_U8_vx}, - {"vsi_nn_kernel_resize_vx", vsi_nn_kernel_resize_vx}, {"vsi_nn_kernel_roi_align_vx", vsi_nn_kernel_roi_align_vx}, - {"vsi_nn_kernel_scale_vx", vsi_nn_kernel_scale_vx}, - {"vsi_nn_kernel_shufflechannel_vx", vsi_nn_kernel_shufflechannel_vx}, - {"vsi_nn_kernel_shufflechannel_axis1_vx", vsi_nn_kernel_shufflechannel_axis1_vx}, {"vsi_nn_kernel_signalframe_vx", vsi_nn_kernel_signalframe_vx}, - {"vsi_nn_kernel_space2depth_vx", vsi_nn_kernel_space2depth_vx}, {"vsi_nn_kernel_tensorstackconcat_vx", vsi_nn_kernel_tensorstackconcat_vx}, {"vsi_nn_kernel_topk_vx", vsi_nn_kernel_topk_vx}, {"vsi_nn_kernel_transform_gemm_vx", vsi_nn_kernel_transform_gemm_vx}, @@ -48210,6 +49321,7 @@ static const source_map_t cl_resource[] = {"instance_normalization_u8_cl", instance_normalization_u8_cl}, {"l2normalizescale_axis0_cl", l2normalizescale_axis0_cl}, {"l2normalizescale_axis1_cl", l2normalizescale_axis1_cl}, + {"layer_normalization_cl", layer_normalization_cl}, {"log_softmax_axis0_cl", log_softmax_axis0_cl}, {"log_softmax_axis1_cl", log_softmax_axis1_cl}, {"log_softmax_axis2_cl", log_softmax_axis2_cl}, @@ -48271,8 +49383,10 @@ static const source_map_t cl_resource[] = {"resize_1d_nearest_cl", resize_1d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, {"resize_nearest_cl", resize_nearest_cl}, + {"roi_align_cl", roi_align_cl}, {"scatter_nd_cl", scatter_nd_cl}, {"select_cl", select_cl}, + {"space2depth_internal_cl", space2depth_internal_cl}, {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, {"upsample_cl", upsample_cl}, diff --git a/src/tim/vx/internal/src/makefile.linux b/src/tim/vx/internal/src/makefile.linux index 86691ef..799a920 100644 --- a/src/tim/vx/internal/src/makefile.linux +++ b/src/tim/vx/internal/src/makefile.linux @@ -10,8 +10,11 @@ CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default") ################################################################################ # Supply necessary libraries. 
- -LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC +ifeq ($(USE_VXC_BINARY)$(USE_VSC_LITE),11) +LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC_Lite -lGAL +else +LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC -lGAL +endif LIBS += -lm -ldl ############################################################################# diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c index 8f115d9..0a260d2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -219,7 +219,10 @@ static vsi_bool op_check IO_TYPE(D_F32, D_I32) IO_TYPE(D_F16, D_I32) IO_TYPE(D_I32, D_I32) - IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I8, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_U8, D_I32) END_IO_TYPE_DECL(ARGMIN) if(!VALIDATE_OP_IO_TYPES(ARGMIN, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c index 04757b5..cb1cda3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -44,190 +44,6 @@ #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_CROP_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_crop_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.crop); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, offset[0] ); - _SET_PARAM( 1, VX_TYPE_INT32, offset[1] ); - _SET_PARAM( 2, VX_TYPE_INT32, offset[2] ); - -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_pre_init - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e dataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e dstFormat = outputs[0]->attr.dtype.vx_type; - - if (dataFormat == VSI_NN_TYPE_FLOAT16 - || (dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_INT16)) - { - kernel_info->kernel_index = 1; - } - else if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 3; - } - else - { - kernel_info->kernel_index = 2; - } - - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; -#endif - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -236,7 +52,6 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) vx_nn_stride_slice_params_t param; vsi_nn_tensor_t *begin_dims_tensor = NULL; vsi_nn_tensor_t *end_dims_tensor = NULL; @@ -317,36 +132,6 @@ static vsi_status op_compute { status = VSI_SUCCESS; } -#else - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_crop"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_CROP_list; - kernel_info.init_index = 1; - - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) - { - vx_op_pre_init(self, inputs, outputs, &kernel_info); - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } -#endif OnError: if (begin_dims_tensor) vsi_nn_ReleaseTensor(&begin_dims_tensor); if (end_dims_tensor) vsi_nn_ReleaseTensor(&end_dims_tensor); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index ca6b3db..e86fe3d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -221,6 +221,9 @@ static vsi_bool op_check IO_TYPE(D_BF16, D_F32) IO_TYPE(D_I32, D_I32) IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_DFP) + IO_TYPE(D_U8, 
D_U8|Q_ASYM) END_IO_TYPE_DECL(DATACONVERT) if (!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index d86b715..4e33da4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -196,6 +196,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c index 39dbfe9..b942f2c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c @@ -89,7 +89,11 @@ static vsi_bool op_check IO_TYPE(D_I32, D_F16, D_F16) IO_TYPE(D_I32, D_F32, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_U8|Q_ASYM, D_F32) IO_TYPE(D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I32, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_F16, D_F16, D_F16) END_IO_TYPE_DECL(EMBEDDING_LOOKUP) if (!VALIDATE_OP_IO_TYPES(EMBEDDING_LOOKUP, self, inputs, self->input.num, outputs, self->output.num)) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c index 7fca31a..d1af0cd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c @@ -42,215 +42,6 @@ #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_FCL2_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_fcl_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.fcl); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis ); - //_SET_PARAM( 1, VX_TYPE_FLOAT32, bias ); - //_SET_PARAM( 2, VX_TYPE_TENSOR, data_bias ); - //_SET_PARAM( 3, VX_TYPE_TENSOR, data_weight ); - //_SET_PARAM( 4, VX_TYPE_FLOAT32, regularize ); - _SET_PARAM( 1, VX_TYPE_INT32, weights ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( 
- vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - uint32_t axis; - vsi_nn_fcl_param * p; - uint32_t i = 0; - uint32_t num_fc = 1, num_no_fc = 1; - uint32_t num_of_dims[3] = {0}; - uint32_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t output_size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t weights_size[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t ofm = 0; - uint32_t dims = 0; - vx_tensor input = NULL; - vx_tensor output = NULL; - vx_tensor weight = NULL; - vx_tensor bias = NULL; - int32_t index = 0; - vx_border_t border; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - p = (vsi_nn_fcl_param *)&(self->nn_param.fcl); - axis = p->axis; - - memcpy(input_size, inputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); - num_of_dims[0] = inputs[0]->attr.dim_num; - memcpy(output_size, outputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); - num_of_dims[1] = outputs[0]->attr.dim_num; - memcpy(weights_size, inputs[1]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); - num_of_dims[2] = inputs[1]->attr.dim_num; - - ofm = weights_size[num_of_dims[2] - 1]; - - for(i = 0; i <= (uint32_t)axis; ++i) - { - num_fc *= input_size[i]; - } - for(i = axis + 1; i < num_of_dims[0]; ++i) - { - num_no_fc *= input_size[i]; - } - - size[0] = num_fc; - size[1] = num_no_fc; - dims= 2; - input = vxReshapeTensor(inputs[0]->t, size, dims); - - size[0] = num_fc; - size[1] = ofm; - dims= 2; - weight = vxReshapeTensor(inputs[1]->t, size, dims); - - size[0] = ofm; - size[1] = 1; - dims= 2; - bias = vxReshapeTensor(inputs[2]->t, size, dims); - - size[0] = ofm; - size[1] = num_no_fc; - dims= 2; - output = vxReshapeTensor(outputs[0]->t, size, dims); - - status |= vxSetParameterByIndex(self->n, index++, (vx_reference)input); - status |= vxSetParameterByIndex(self->n, index++, (vx_reference)weight); - status |= vxSetParameterByIndex(self->n, index++, (vx_reference)bias); - status |= vxSetParameterByIndex(self->n, index++, (vx_reference)output); - - border.mode = VX_BORDER_CONSTANT; - border.constant_value.S16 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - if (input) vxReleaseTensor(&input); - if (weight) vxReleaseTensor(&weight); - if (bias) vxReleaseTensor(&bias); - if (output) vxReleaseTensor(&output); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; -#endif - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -259,7 +50,6 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) uint32_t 
axis; vsi_nn_fcl_param * p; uint32_t i = 0; @@ -343,30 +133,7 @@ static vsi_status op_compute if (weight) vxReleaseTensor(&weight); if (bias) vxReleaseTensor(&bias); if (output) vxReleaseTensor(&output); -#else - vsi_nn_kernel_info_t kernel_info; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_fullconnect2"; - kernel_info.type = VX_KERNEL_TYPE_VX; - kernel_info.kernel = vx_kernel_FCL2_list; - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } -#endif return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 4cc922e..81af2ce 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -74,6 +74,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); vsi_nn_kernel_param_add_int32( param, "block_num", block_num ); vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num ); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); vsi_nn_kernel_param_add_int32( param, "indices_num", indices_num ); n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); if( n != NULL ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index 46a23ce..2a0f6a2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -41,6 +41,50 @@ #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) +static vsi_status _try_set_high_presision_tensor + ( + vsi_nn_tensor_t **inputs + ) +{ + vsi_status status; + vsi_nn_vxtensor_attr_t attr; + + status = VSI_SUCCESS; + attr = VSI_NN_TENSOR_ATTR_HIGH_PRECISION; + + if(VSI_NN_TYPE_FLOAT32 == inputs[1]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[1], attr); + if(VSI_SUCCESS != status) + { + return status; + } + } + if(VSI_NN_TYPE_FLOAT32 == inputs[2]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[2], attr); + if(VSI_SUCCESS != status) + { + return status; + } + } + + return status; +} + +static vsi_bool _is_3d_instance_norm + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + if( 3 == inputs[0]->attr.dim_num ) + { + return TRUE; + } + return FALSE; +} /* _is_3d_instance_norm() */ + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -55,19 +99,42 @@ static vsi_status op_compute uint32_t *input_size = inputs[0]->attr.size; uint32_t dims_num = inputs[0]->attr.dim_num; int32_t rs_flg = 0; + vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + vsi_nn_instancenorm_lcl_data2 *local = self->nn_param.instancenorm.lcl2_data; - param =vsi_nn_kernel_param_create(); - - if((input_size[1] * input_size[2] < 65536) - && dims_num > 2) + status = _try_set_high_presision_tensor(inputs); + if(status != VSI_SUCCESS) { - rs_flg = 1; + VSILOGE("Set tensor attr of high presision fail"); + return 
status; } + if(_is_3d_instance_norm(self, inputs)) + { + tmp_inputs[0] = local->reshaped_input; + tmp_outputs[0] = local->reshaped_output; + tmp_inputs[1] = inputs[1]; + tmp_inputs[2] = inputs[2]; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_outputs[0] = outputs[0]; + tmp_inputs[1] = inputs[1]; + tmp_inputs[2] = inputs[2]; + if((input_size[1] * input_size[2] < 65536) + && dims_num > 2) + { + rs_flg = 1; + } + } + + param =vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); n = vsi_nn_kernel_selector( self->graph, "instance_norm", - inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + tmp_inputs, _INPUT_NUM, tmp_outputs, _OUTPUT_NUM, param ); if( n != NULL ) { self->n = (vx_node)n; @@ -82,6 +149,59 @@ static vsi_status op_compute return status; } /* op_compute() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + uint32_t dim = 0; + vsi_nn_instancenorm_lcl_data2 *local = NULL; + uint32_t shape[VSI_NN_MAX_DIM_NUM]; + char tensor_name[128]; + + dim = inputs[0]->attr.dim_num; + if(_is_3d_instance_norm(self, inputs) == FALSE) + { + return VSI_SUCCESS; + } + + VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + /* + insert a reshape node before and after 3D instance_norm + */ + shape[0] = 1; + shape[1] = inputs[0]->attr.size[0]; + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + dim = 4; + local = self->nn_param.instancenorm.lcl2_data; + if (VSI_NN_OPTIMIZE_FORWARD == direction) + { + /* reshape 3d input (xcn) --> 4d input (whcn) */ + local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); + } + else + { + /* reshape 3d output(xcn) --> 4d output(whcn) */ + local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); + if(local->reshaped_output && local->reshaped_output->t) + { + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); + if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid); + return VSI_FAILURE; + } + } + } + + return VSI_SUCCESS; +} /* op_optimize() */ + static vsi_bool op_check ( vsi_nn_node_t * self, @@ -133,6 +253,8 @@ static vsi_status op_init self->nn_param.instancenorm.lcl2_data->reshapeFlg = 0; self->nn_param.instancenorm.lcl2_data->execute_on_sw = 0; self->nn_param.instancenorm.lcl2_data->hash_idx = 0; + self->nn_param.instancenorm.lcl2_data->reshaped_input = NULL; + self->nn_param.instancenorm.lcl2_data->reshaped_output = NULL; return status; } /* op_init() */ @@ -143,6 +265,7 @@ static vsi_status op_deinit ) { uint32_t i; + vsi_nn_instancenormalize_param *p = &(self->nn_param.instancenorm); for (i = 0; i < _VSI_NN_INSTANCENORM_LOCAL_TENSOR_NUM; i++) { if (self->nn_param.instancenorm.local.local_tensor[i] != NULL) @@ -151,6 +274,16 @@ static vsi_status op_deinit self->nn_param.instancenorm.local.local_tensor[i] = NULL; } } + if(p->lcl2_data->reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_input)); + p->lcl2_data->reshaped_input = NULL; + } + if(p->lcl2_data->reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_output)); + p->lcl2_data->reshaped_output = NULL; + } if(self->nn_param.instancenorm.lcl2_data) { 
free(self->nn_param.instancenorm.lcl2_data); @@ -173,7 +306,7 @@ DEF_OP_REG /* deinit */ op_deinit, /* check */ op_check, /* setup */ vsi_nn_op_common_setup, - /* optimize */ NULL, + /* optimize */ op_optimize, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index 52a54bb..04e5610 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -115,6 +115,45 @@ final: } +static vsi_bool _check_value_is_equal_to_one + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor + ) +{ + vsi_bool ret = TRUE; + float* tensor_data = NULL; + uint32_t elements = 0; + uint32_t i = 0; + + elements = vsi_nn_GetElementNum( tensor ); + tensor_data = vsi_nn_ConvertTensorToFloat32Data( graph, tensor ); + if ( NULL == tensor_data ) + { + VSILOGE( "Convert data fail." ); + return FALSE; + } + + for (i = 0; i < elements; i++) + { + if ( vsi_abs(tensor_data[i] - 1.0f) > 1e-5 ) + { + ret = FALSE; + break; + } + } + + if ( !tensor->attr.is_created_from_handle ) + { + if ( tensor_data ) + { + free(tensor_data); + } + } + + return ret; +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -141,6 +180,11 @@ static vsi_status op_compute p = &(self->nn_param.l2normalizescale); axis = p->axis; + if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1]) ) + { + return vsi_nn_internal_compute_node( self ); + } + param =vsi_nn_kernel_param_create(); ret = vsi_nn_kernel_optimize_reduce_shape( @@ -240,6 +284,9 @@ static vsi_status op_deinit self->nn_param.l2normalizescale.local.local_tensor[i] = NULL; } } + + vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -253,11 +300,15 @@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; + vsi_nn_internal_node_t* curr = NULL; + if( NULL == self ) { return FALSE; } + vsi_nn_internal_init_node_wksp( self ); + if (self->nn_param.l2normalizescale.axis < 0) { self->nn_param.l2normalizescale.axis += (int32_t)inputs[0]->attr.dim_num; @@ -269,6 +320,15 @@ static vsi_bool op_setup return FALSE; } + if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one( self->graph, inputs[1] ) ) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); + curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + } + ret = vsi_nn_op_common_setup(self, inputs, outputs); return ret; @@ -280,7 +340,7 @@ static vsi_status op_init ) { vsi_status status = VSI_SUCCESS; - uint32_t i; + uint32_t i = 0; if (vsi_nn_compareVersion(self->graph, 1, 1, 13) == -1) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 87f2b54..7cc8663 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -35,312 +35,11 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (1) #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_LAYERNORM_list[]; - -static void check_tensor_shape - ( - 
vsi_nn_node_t * self, - vsi_nn_tensor_t * input, - vx_reference * params, - uint32_t index, - vx_bool rsFlg - ) -{ - vsi_nn_tensor_attr_t attr; - - if (index == 0 ) - { - if(input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1) - ||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1)) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - } - else if(index == 1 ) - { - if(input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - - } - else if(index == 2) - { - if(input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - } - else if(index == 3) - { - if(input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1) - ||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1)) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.dim_num = 2; - self->nn_param.layernorm.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - } - else - { - VSILOGE("No more local tensor!(LAYERNORM) at [%s : %d]\n", __FILE__, __LINE__); - } -} - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_layernormalize_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( 
vx_reference * ) * num ); - p = &(node->nn_param.layernorm); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_FLOAT32, eps ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; - vsi_nn_type_e scaleDataFormat = inputs[2]->attr.dtype.vx_type; - if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 1; - } - else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 2; - } - else if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_UINT8 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 3; - } - else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16 - && scaleDataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_layernormalize_U8"; - kernel_info->kernel_index = 4; - } - else - { - VSILOGE("Not support input or output data format!(LAYERNORM) at [%s : %d]\n", __FILE__, __LINE__); - return VSI_FAILURE; - } - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - vx_reference * args; - vx_bool rsFlg = FALSE; - int32_t in_zp; - vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_tensor_attr_t attr; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - //_set_inputs_outputs( params, inputs, outputs ); - check_tensor_shape(self, inputs[0], params, 0, rsFlg); - check_tensor_shape(self, inputs[1], params, 1, rsFlg); - check_tensor_shape(self, inputs[2], params, 2, rsFlg); - check_tensor_shape(self, outputs[0], params, 3, rsFlg); - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(inputs[0]->t, &attr); - in_zp = 
attr.dtype.zero_point; - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = 0; - border.constant_value.S16 = 0; - border.constant_value.U8 = 0; - if(inputDataFormat == VSI_NN_TYPE_UINT8) - { - border.constant_value.U32 = (vx_uint32)in_zp; - border.constant_value.S16 = (vx_int16)in_zp; - border.constant_value.U8 = (vx_uint8)in_zp; - } - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -349,35 +48,44 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + float eps = self->nn_param.instancenorm.eps; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t dims_num = inputs[0]->attr.dim_num; + int32_t rs_flg = 0; + int32_t wh_flg = 0; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_layernormalize"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_LAYERNORM_list; - kernel_info.init_index = 1; + param =vsi_nn_kernel_param_create(); - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) + if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) { - vx_op_pre_compute(self, inputs, outputs, &kernel_info); + wh_flg = 1; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) + if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH) + && dims_num > 2) { - return VSI_FAILURE; + rs_flg = 1; } - if (NULL != op_compute_list[kernel_info.init_index]) + vsi_nn_kernel_param_add_float32( param, "eps", eps ); + vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); + vsi_nn_kernel_param_add_int32( param, "wh_flg", wh_flg ); + n = vsi_nn_kernel_selector( self->graph, "layer_norm", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + if ( n != NULL ) { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + self->n = (vx_node)n; + status = VSI_SUCCESS; } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + return status; } /* op_compute() */ @@ -389,10 +97,12 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_F16, D_F32, D_F16, D_F16) IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) END_IO_TYPE_DECL(LAYER_NORM) if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) { @@ -438,8 +148,8 @@ DEF_OP_REG /* check */ op_check, /* setup */ vsi_nn_op_common_setup, /* optimize */ NULL, - /* input_num */ 3, - /* output_num */ 1 + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM ); #ifdef __cplusplus } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index e61783d..f4b8efe 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -65,13 +65,13 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB ); n = vsi_nn_kernel_selector( self->graph, "matrixmul", inputs, 2, outputs, 1, param ); - if( n != NULL ) + if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } @@ -103,15 +103,19 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_F16, D_F16, D_F16) IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_I8|Q_DFP, D_F32) + IO_TYPE(D_F32, D_I16|Q_DFP, D_F32) + IO_TYPE(D_F32, D_I32, D_F32) IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) END_IO_TYPE_DECL(MATRIXMUL) - if(!VALIDATE_OP_IO_TYPES(MATRIXMUL, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(MATRIXMUL, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -141,7 +145,7 @@ static vsi_bool op_check return FALSE; } - if(inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2 + if (inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2 && inputs[0]->attr.size[2] != 1 && inputs[1]->attr.size[2] != 1 && inputs[0]->attr.size[2] != inputs[1]->attr.size[2]) { @@ -160,7 +164,7 @@ static vsi_bool op_setup ) { uint32_t i = 0; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = vsi_nn_max(inputs[0]->attr.dim_num, inputs[1]->attr.dim_num); @@ -188,21 +192,21 @@ static vsi_bool op_setup return FALSE; } - if(inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) + if (inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) { for (i = 2; i < inputs[0]->attr.dim_num; i++) { outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } } - else if(inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) + else if (inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) { for (i = 2; i < inputs[1]->attr.dim_num; i++) { outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; } } - else if(inputs[0]->attr.size[2] >= inputs[1]->attr.size[2]) + else if (inputs[0]->attr.size[2] >= inputs[1]->attr.size[2]) { for (i = 2; i < inputs[0]->attr.dim_num; i++) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index c1877de..ccb0510 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -81,295 +81,413 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_internal_node_t* curr = NULL; - vsi_nn_pre_process_param * p; + vsi_nn_pre_process_param * p = NULL; vsi_bool ret = TRUE; + vsi_nn_internal_tensor_t* preprocess_tensor = NULL; + vsi_nn_preprocess_dest_layout_e layout = VSI_NN_DEST_LAYOUT_NCHW; p = (vsi_nn_pre_process_param *)&(self->nn_param.pre_process); vsi_nn_internal_init_node_wksp( self ); - if (p->type == VSI_NN_SOURCE_FORMAT_TENSOR) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_TENSOR, 0, 0 ); - - curr->node->nn_param.pre_process_tensor.perm = p->perm; - curr->node->nn_param.pre_process_tensor.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); - - curr->node->nn_param.pre_process_gray.mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_gray.scale = p->norm.scale; - curr->node->nn_param.pre_process_gray.rect.left = p->rect.left; - curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; - curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; - curr->node->nn_param.pre_process_gray.rect.height = p->rect.height; - curr->node->nn_param.pre_process_gray.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB, 0, 0 ); - - if (p->reverse_channel) - { - curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[0]; - } - else - { - curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_rgb.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_rgb.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_rgb.rect.left = p->rect.left; - curr->node->nn_param.pre_process_rgb.rect.top = p->rect.top; - curr->node->nn_param.pre_process_rgb.rect.width = p->rect.width; - curr->node->nn_param.pre_process_rgb.rect.height = p->rect.height; - curr->node->nn_param.pre_process_rgb.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_rgb.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_rgb.perm = p->perm; - curr->node->nn_param.pre_process_rgb.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV420, 0, 0 ); - - if (p->reverse_channel) - { - curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[0]; - } - else - { - curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; - 
curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_yuv420.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv420.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_yuv420.rect.left = p->rect.left; - curr->node->nn_param.pre_process_yuv420.rect.top = p->rect.top; - curr->node->nn_param.pre_process_yuv420.rect.width = p->rect.width; - curr->node->nn_param.pre_process_yuv420.rect.height = p->rect.height; - curr->node->nn_param.pre_process_yuv420.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_yuv420.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_yuv420.perm = p->perm; - curr->node->nn_param.pre_process_yuv420.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; - curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_BGRA, 0, 0 ); - - if (p->reverse_channel) - { - curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[0]; - } - else - { - curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_bgra.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_bgra.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_bgra.rect.left = p->rect.left; - curr->node->nn_param.pre_process_bgra.rect.top = p->rect.top; - curr->node->nn_param.pre_process_bgra.rect.width = p->rect.width; - curr->node->nn_param.pre_process_bgra.rect.height = p->rect.height; - curr->node->nn_param.pre_process_bgra.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_bgra.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_bgra.perm = p->perm; - curr->node->nn_param.pre_process_bgra.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR) + if ( p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR + ) { uint32_t i = 0; - uint32_t axis = 2; - uint32_t group = 3; - vsi_nn_tensor_t ** input_tensor_group = &p->local->local_tensor[0]; - vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL}; - vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; + uint32_t _axis = 0; vsi_nn_tensor_attr_t attr; - float mean[3] = {0}; + vsi_bool use_virtual_tensor = TRUE; - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - ret = vsi_nn_CreateTensorGroup(self->graph, inputs[0], axis, - input_tensor_group, group); - if (ret == FALSE) + for (i = 0; i < p->dim_num; i++) { - goto final; + _axis = p->perm[i]; + if (_axis != i) + break; } - memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); - memcpy(&attr.size, 
p->output_attr.size, p->output_attr.dim_num * sizeof(uint32_t)); - attr.size[axis] = 1; - attr.vtl = TRUE; - attr.is_const = FALSE; - output_tensor_group[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - output_tensor_group[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - output_tensor_group[2] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - - if (p->reverse_channel) + if (i != self->nn_param.pre_process_rgb.dim_num) { - int32_t order[3] = {2, 1, 0}; - - mean[0] = p->norm.mean[2]; - mean[1] = p->norm.mean[1]; - mean[2] = p->norm.mean[0]; - - vsi_nn_reorder_tensor( (vsi_nn_tensor_t **)output_tensor_group, order, - 3, (vsi_nn_tensor_t **)tmp_outputs ); - } - else - { - mean[0] = p->norm.mean[0]; - mean[1] = p->norm.mean[1]; - mean[2] = p->norm.mean[2]; - - memmove( tmp_outputs, output_tensor_group, sizeof(vsi_nn_tensor_t*) * 3 ); + layout = VSI_NN_DEST_LAYOUT_NHWC; } - for (i = 0; i < 3; i++) + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + memcpy( &attr, &outputs[PRE_PROCESS_OUTPUT]->attr, sizeof( attr ) ); + attr.size[0] = p->output_attr.size[1]; + attr.size[1] = p->output_attr.size[2]; + attr.size[2] = p->output_attr.size[0]; + p->output_attr.size[0] = attr.size[0]; + p->output_attr.size[1] = attr.size[1]; + p->output_attr.size[2] = attr.size[2]; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + + preprocess_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + } + } + + switch (p->type) + { + case VSI_NN_SOURCE_FORMAT_TENSOR: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_TENSOR, 0, 0 ); + + curr->node->nn_param.pre_process_tensor.perm = p->perm; + curr->node->nn_param.pre_process_tensor.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_GRAY: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); - curr->node->nn_param.pre_process_gray.mean = mean[i]; + curr->node->nn_param.pre_process_gray.mean = p->norm.mean[0]; curr->node->nn_param.pre_process_gray.scale = p->norm.scale; curr->node->nn_param.pre_process_gray.rect.left = p->rect.left; curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; curr->node->nn_param.pre_process_gray.rect.height = p->rect.height; - curr->node->nn_param.pre_process_gray.output_attr.size = attr.size; + curr->node->nn_param.pre_process_gray.output_attr.size = p->output_attr.size; curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; - curr->inputs[0] = input_tensor_group[i]; - curr->outputs[0] = output_tensor_group[i]->t; + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; vsi_nn_internal_setup_node(self, curr); } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_RGB: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB, 0, 0 ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 3, 1 ); + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[2]; + } - curr->node->nn_param.concat.axis = axis; - 
curr->inputs[0] = tmp_outputs[0]->t; - curr->inputs[1] = tmp_outputs[1]->t; - curr->inputs[2] = tmp_outputs[2]->t; - curr->outputs[0] = outputs[0]; + curr->node->nn_param.pre_process_rgb.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_rgb.rect.left = p->rect.left; + curr->node->nn_param.pre_process_rgb.rect.top = p->rect.top; + curr->node->nn_param.pre_process_rgb.rect.width = p->rect.width; + curr->node->nn_param.pre_process_rgb.rect.height = p->rect.height; + curr->node->nn_param.pre_process_rgb.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_rgb.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_rgb.perm = p->perm; + curr->node->nn_param.pre_process_rgb.dim_num = p->dim_num; - vsi_nn_internal_setup_node(self, curr); + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_YUV420: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV420, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_yuv420.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_yuv420.rect.left = p->rect.left; + curr->node->nn_param.pre_process_yuv420.rect.top = p->rect.top; + curr->node->nn_param.pre_process_yuv420.rect.width = p->rect.width; + curr->node->nn_param.pre_process_yuv420.rect.height = p->rect.height; + curr->node->nn_param.pre_process_yuv420.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_yuv420.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_yuv420.perm = p->perm; + curr->node->nn_param.pre_process_yuv420.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; + curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_BGRA: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_BGRA, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_bgra.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.reverse_channel = p->reverse_channel; + 
curr->node->nn_param.pre_process_bgra.rect.left = p->rect.left; + curr->node->nn_param.pre_process_bgra.rect.top = p->rect.top; + curr->node->nn_param.pre_process_bgra.rect.width = p->rect.width; + curr->node->nn_param.pre_process_bgra.rect.height = p->rect.height; + curr->node->nn_param.pre_process_bgra.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_bgra.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_bgra.perm = p->perm; + curr->node->nn_param.pre_process_bgra.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR: + { + uint32_t i = 0; + uint32_t axis = 2; + uint32_t group = 3; + vsi_nn_tensor_t ** input_tensor_group = &p->local->local_tensor[0]; + vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL}; + vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; + vsi_nn_tensor_attr_t attr; + float mean[3] = {0}; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + ret = vsi_nn_CreateTensorGroup(self->graph, inputs[0], axis, + input_tensor_group, group); + if (ret == FALSE) + { + goto final; + } + + memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); + memcpy(&attr.size, p->output_attr.size, p->output_attr.dim_num * sizeof(uint32_t)); + attr.size[axis] = 1; + attr.vtl = TRUE; + attr.is_const = FALSE; + output_tensor_group[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + output_tensor_group[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + output_tensor_group[2] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + if (p->reverse_channel) + { + int32_t order[3] = {2, 1, 0}; + + mean[0] = p->norm.mean[2]; + mean[1] = p->norm.mean[1]; + mean[2] = p->norm.mean[0]; + + vsi_nn_reorder_tensor( (vsi_nn_tensor_t **)output_tensor_group, order, + 3, (vsi_nn_tensor_t **)tmp_outputs ); + } + else + { + mean[0] = p->norm.mean[0]; + mean[1] = p->norm.mean[1]; + mean[2] = p->norm.mean[2]; + + memmove( tmp_outputs, output_tensor_group, sizeof(vsi_nn_tensor_t*) * 3 ); + } + + for (i = 0; i < 3; i++) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); + + curr->node->nn_param.pre_process_gray.mean = mean[i]; + curr->node->nn_param.pre_process_gray.scale = p->norm.scale; + curr->node->nn_param.pre_process_gray.rect.left = p->rect.left; + curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; + curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; + curr->node->nn_param.pre_process_gray.rect.height = p->rect.height; + curr->node->nn_param.pre_process_gray.output_attr.size = attr.size; + curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; + + curr->inputs[0] = input_tensor_group[i]; + curr->outputs[0] = output_tensor_group[i]->t; + + vsi_nn_internal_setup_node(self, curr); + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 3, 1 ); + + curr->node->nn_param.concat.axis = axis; + curr->inputs[0] = tmp_outputs[0]->t; + curr->inputs[1] = tmp_outputs[1]->t; + curr->inputs[2] = tmp_outputs[2]->t; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_YUV444: + { + curr = 
vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV444, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_yuv444.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_yuv444.rect.left = p->rect.left; + curr->node->nn_param.pre_process_yuv444.rect.top = p->rect.top; + curr->node->nn_param.pre_process_yuv444.rect.width = p->rect.width; + curr->node->nn_param.pre_process_yuv444.rect.height = p->rect.height; + curr->node->nn_param.pre_process_yuv444.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_yuv444.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_yuv444.perm = p->perm; + curr->node->nn_param.pre_process_yuv444.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; + curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + case VSI_NN_SOURCE_FORMAT_IMAGE_NV12: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_nv12.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; + curr->node->nn_param.pre_process_nv12.rect.top = p->rect.top; + curr->node->nn_param.pre_process_nv12.rect.width = p->rect.width; + curr->node->nn_param.pre_process_nv12.rect.height = p->rect.height; + curr->node->nn_param.pre_process_nv12.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_nv12.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_nv12.perm = p->perm; + curr->node->nn_param.pre_process_nv12.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; + default: + { + VSILOGE( "Not support this type!(PRE_PROCESS)\n"); + ret = FALSE; + } + break; } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444) + + if ( p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || + p->type 
== VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR + ) { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV444, 0, 0 ); - - if (p->reverse_channel) + if (layout == VSI_NN_DEST_LAYOUT_NHWC) { - curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[0]; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + curr->node->nn_param.permute.perm = p->perm; + curr->node->nn_param.permute.dim_num = p->dim_num; + curr->inputs[0] = preprocess_tensor->t; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node( self, curr ); } - else - { - curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_yuv444.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv444.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_yuv444.rect.left = p->rect.left; - curr->node->nn_param.pre_process_yuv444.rect.top = p->rect.top; - curr->node->nn_param.pre_process_yuv444.rect.width = p->rect.width; - curr->node->nn_param.pre_process_yuv444.rect.height = p->rect.height; - curr->node->nn_param.pre_process_yuv444.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_yuv444.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_yuv444.perm = p->perm; - curr->node->nn_param.pre_process_yuv444.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; - curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); - - if (p->reverse_channel) - { - curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[2]; - curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[0]; - } - else - { - curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; - curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2]; - } - - curr->node->nn_param.pre_process_nv12.rgb_scale = p->norm.scale; - curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; - curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; - curr->node->nn_param.pre_process_nv12.rect.top = p->rect.top; - curr->node->nn_param.pre_process_nv12.rect.width = p->rect.width; - curr->node->nn_param.pre_process_nv12.rect.height = p->rect.height; - curr->node->nn_param.pre_process_nv12.output_attr.size = p->output_attr.size; - curr->node->nn_param.pre_process_nv12.output_attr.dim_num = p->output_attr.dim_num; - curr->node->nn_param.pre_process_nv12.perm = p->perm; - curr->node->nn_param.pre_process_nv12.dim_num = p->dim_num; - - curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; - curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - - vsi_nn_internal_setup_node(self, curr); - } - else - { - VSILOGE( "Not support this type!(PRE_PROCESS)\n"); - return FALSE; } final: diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c index c0889c6..4ac9bb1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c @@ -109,7 +109,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_bgra_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_bgra_param *)&(self->nn_param.pre_process_bgra); @@ -155,28 +154,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_bgra.dim_num; i++) - { - axis = self->nn_param.pre_process_bgra.perm[i]; - if (axis != i) - break; - } - - if (i == self->nn_param.pre_process_bgra.dim_num) - self->nn_param.pre_process_bgra.local.enable_perm = FALSE; - else - self->nn_param.pre_process_bgra.local.enable_perm = TRUE; - - if (self->nn_param.pre_process_bgra.local.enable_perm == FALSE) - { - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; - } - else - { - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; - } + p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index d0f1454..d754e27 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -107,7 +107,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_nv12_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_nv12_param *)&(self->nn_param.pre_process_nv12); @@ -153,28 +152,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_nv12.dim_num; i++) - { - axis = self->nn_param.pre_process_nv12.perm[i]; - if (axis != i) - break; - } - - if (i == self->nn_param.pre_process_nv12.dim_num) - self->nn_param.pre_process_nv12.local->enable_perm = FALSE; - else - self->nn_param.pre_process_nv12.local->enable_perm = TRUE; - - if (self->nn_param.pre_process_nv12.local->enable_perm == FALSE) - { - p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; - } - else - { - p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; - } + p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index bd9e5c8..a31005d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -112,7 +112,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_rgb_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_rgb_param *)&(self->nn_param.pre_process_rgb); @@ -158,17 +157,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_rgb.dim_num; i++) - { - axis = self->nn_param.pre_process_rgb.perm[i]; - if (axis != i) - break; - } - if (i == self->nn_param.pre_process_rgb.dim_num) - self->nn_param.pre_process_rgb.local.enable_perm = FALSE; - else - self->nn_param.pre_process_rgb.local.enable_perm = TRUE; + self->nn_param.pre_process_rgb.local.enable_perm = FALSE; if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c index 3fe0c49..50c2355 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -108,7 +108,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_yuv420_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_yuv420_param *)&(self->nn_param.pre_process_yuv420); @@ -154,28 +153,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_yuv420.dim_num; i++) - { - axis = self->nn_param.pre_process_yuv420.perm[i]; - if (axis != i) - break; - } - - if (i == self->nn_param.pre_process_yuv420.dim_num) - self->nn_param.pre_process_yuv420.local.enable_perm = FALSE; - else - self->nn_param.pre_process_yuv420.local.enable_perm = TRUE; - - if (self->nn_param.pre_process_yuv420.local.enable_perm == FALSE) - { - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; - } - else - { - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; - } + p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c index 0d7d370..99a7674 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c @@ -108,7 +108,6 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_yuv444_param * p = NULL; - uint32_t axis = 0; uint32_t i = 0; p = (vsi_nn_pre_process_yuv444_param *)&(self->nn_param.pre_process_yuv444); @@ -154,28 +153,8 @@ static vsi_bool op_setup } } - for (i = 0; i < self->nn_param.pre_process_yuv444.dim_num; i++) - { - axis = self->nn_param.pre_process_yuv444.perm[i]; - if (axis != i) - break; - } - - if (i == self->nn_param.pre_process_yuv444.dim_num) - self->nn_param.pre_process_yuv444.local->enable_perm = FALSE; - else - self->nn_param.pre_process_yuv444.local->enable_perm = TRUE; - - if (self->nn_param.pre_process_yuv444.local->enable_perm == FALSE) - { - p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; - } - else - { - p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; - } + p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 323ee4a..3d01e79 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -43,8 +43,6 @@ #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - typedef struct _vsi_nn_reduce_lcl2_data_t { vsi_nn_tensor_t *reshaped_input; @@ -57,125 +55,6 @@ typedef struct _vsi_nn_reduce_lcl2_data_t int32_t axes_num; } vsi_nn_reduce_lcl2_data_t; -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_REDUCE_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vx_uint32 i; - vx_uint32 cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vx_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - vx_uint32 num - ) -{ - vx_status status; - vx_context ctx; - vsi_nn_reduce_param * p = NULL; - if( 0 == num ) - { - return VX_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &node->nn_param.reduce; - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VX_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis_num ); - _SET_PARAM( 1, VX_TYPE_INT32, keep_dim ); - _SET_PARAM( 2, VX_TYPE_INT32, axis[0] ); - _SET_PARAM( 3, VX_TYPE_INT32, axis[1] ); - _SET_PARAM( 4, VX_TYPE_INT32, axis[2] ); - _SET_PARAM( 5, VX_TYPE_INT32, axis[3] ); - -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - vx_uint32 num - ) -{ - vx_uint32 i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vx_status cpu_op_compute - ( - vsi_nn_node_t * self, - 
vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vx_status status = VX_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args = NULL; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VX_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - NULL -}; -#endif - static vsi_status op_comput_reduce_mean(vsi_nn_node_t * self, vsi_nn_tensor_t *axis_tensor, vx_bool keep_dim, @@ -278,7 +157,6 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) if (self->nn_param.reduce.type == VSI_NN_REDUCE_MEAN) { @@ -574,30 +452,6 @@ static vsi_status op_compute status = vsi_nn_internal_compute_node( self ); } -#else - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_reduce"; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_REDUCE_list; - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } -#endif return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 9fed06c..51ea588 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -50,215 +50,6 @@ #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_RESIZE_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_resize_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &node->nn_param.resize; - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, factor ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - 
uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e inputFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e outputFormat = outputs[0]->attr.dtype.vx_type; - vsi_nn_type_e enableFormat; - float scale_factor = self->nn_param.resize.factor; - - enableFormat = ((inputFormat == VSI_NN_TYPE_FLOAT16 && outputFormat == VSI_NN_TYPE_FLOAT16) || - (inputFormat == VSI_NN_TYPE_INT16 && outputFormat == VSI_NN_TYPE_INT16) || - (inputFormat == VSI_NN_TYPE_INT8 && outputFormat == VSI_NN_TYPE_INT8) || - (inputFormat == VSI_NN_TYPE_UINT8 && outputFormat == VSI_NN_TYPE_UINT8)); - - if(scale_factor == 0.5f && enableFormat && inputs[0]->attr.size[1] % 2 == 0 - && inputs[0]->attr.size[1] * inputs[0]->attr.size[2] < 65536) - { - kernel_info->type = VX_KERNEL_TYPE_VX; - kernel_info->init_index = 1; - if (inputFormat == VX_TYPE_FLOAT16 || inputFormat == VX_TYPE_INT16 ) - { - kernel_info->kernel_index = 1; - } - else - { - kernel_info->kernel_index = 2; - } - } - else - { - kernel_info->type = VX_KERNEL_TYPE_CPU; - kernel_info->kernel_index = 0; - kernel_info->init_index = 0; - } - - return VSI_SUCCESS; -} - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_IO_NUM]; - vx_border_t border; - int32_t sizes[4] = {0}; - uint32_t dims = 2; - uint32_t input_size[4] = {1, 1, 1, 1}; - uint32_t output_size[4] = {1, 1, 1, 1}; - uint32_t i; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - for(i = 0; i < inputs[0]->attr.dim_num; ++i) - { - input_size[i] = inputs[0]->attr.size[i]; - } - for(i = 0; i < outputs[0]->attr.dim_num; ++i) - { - output_size[i] = outputs[0]->attr.size[i]; - } - - - sizes[0] = input_size[0]; - sizes[1] = input_size[1] * input_size[2] * input_size[3]; - self->nn_param.resize.local.local_tensor[0] = vxReshapeTensor(inputs[0]->t, sizes, dims); - - sizes[0] = output_size[0]; - sizes[1] = output_size[1] * output_size[2] * output_size[3]; - self->nn_param.resize.local.local_tensor[1] = vxReshapeTensor(outputs[0]->t, sizes, dims); - - params[0] = (vx_reference)self->nn_param.resize.local.local_tensor[0]; - params[1] = (vx_reference)self->nn_param.resize.local.local_tensor[1]; - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _IO_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; -#endif static vsi_bool _is_same_shape ( @@ -289,7 +80,7 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) + if ( ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) @@ -318,30 +109,7 @@ static vsi_status op_compute status = VSI_SUCCESS; } } -#else - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name = "vsi_nn_kernel_resize"; - kernel_info.kernel = vx_kernel_RESIZE_list; - - op_pre_compute(self, inputs, outputs, &kernel_info); - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } -#endif return status; } /* op_compute() */ @@ -446,17 +214,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { -#if (USE_OVX_API == FALSE) - uint32_t i; - for (i = 0; i < _VSI_NN_RESIZE_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.resize.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.resize.local.local_tensor[i])); - self->nn_param.resize.local.local_tensor[i] = NULL; - } - } -#endif if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c index 53f5bd4..472f994 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c @@ -31,166 +31,15 @@ #include "vsi_nn_node.h" #include "vsi_nn_prv.h" #include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "client/vsi_nn_vxkernel.h" - -#define _ARG_NUM (6) -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_ROI_ALIGN_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_roi_align_param * p; - if( 0 == num ) - { - return 
VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.roi_align); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, output_height ); - _SET_PARAM( 1, VX_TYPE_INT32, output_width ); - _SET_PARAM( 2, VX_TYPE_FLOAT32, height_ratio ); - _SET_PARAM( 3, VX_TYPE_FLOAT32, width_ratio ); - _SET_PARAM( 4, VX_TYPE_INT32, height_sample_num ); - _SET_PARAM( 5, VX_TYPE_INT32, width_sample_num ); - #undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_internal_node.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" static vsi_status op_compute ( @@ -199,46 +48,31 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + float width_ratio = self->nn_param.roi_align.width_ratio; + float height_ratio = self->nn_param.roi_align.height_ratio; + int32_t width_sample_num = self->nn_param.roi_align.width_sample_num; + int32_t height_sample_num = self->nn_param.roi_align.height_sample_num; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_ROI_ALIGN_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_roi_align"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_float32( param, "width_ratio", width_ratio ); + vsi_nn_kernel_param_add_float32( param, "height_ratio", height_ratio ); + vsi_nn_kernel_param_add_int32( param, "width_sample_num", width_sample_num ); + vsi_nn_kernel_param_add_int32( param, "height_sample_num", height_sample_num ); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "roi_align", + inputs, 3, + outputs, 1, param ); + + if ( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -249,17 +83,10 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - /*TODO: Check tensor shapes. */ - //If input0 is uint8, then input1 MUST be uint16, - //with zero point of 0 and scale of 0.125 - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && - inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_UINT16) - { - return FALSE; - } return TRUE; } /* op_check() */ + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -267,19 +94,20 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - /* TODO: Add code to compute outputs' shape. 
*/ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { vsi_nn_roi_align_param *p; p = &(self->nn_param.roi_align); outputs[0]->attr.dim_num = 4; - outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; - outputs[0]->attr.size[1] = p->output_width; - outputs[0]->attr.size[2] = p->output_height; + outputs[0]->attr.size[0] = p->output_width; + outputs[0]->attr.size[1] = p->output_height; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; outputs[0]->attr.size[3] = inputs[1]->attr.size[1]; } + return TRUE; -} /* op_setup() */ +} /* op_init() */ + #ifdef __cplusplus extern "C" { @@ -294,8 +122,8 @@ DEF_OP_REG /* check */ op_check, /* setup */ op_setup, /* optimize */ NULL, - /* input_num */ _INPUT_NUM, - /* output_num */ _OUTPUT_NUM + /* input_num */ 3, + /* output_num */ 1 ); #ifdef __cplusplus } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c index 61b9d13..b7c4056 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c @@ -26,201 +26,21 @@ #include "vsi_nn_types.h" #include "vsi_nn_platform.h" +#include "vsi_nn_log.h" #include "vsi_nn_graph.h" #include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" +#include "utils/vsi_nn_link_list.h" +#include "utils/vsi_nn_dtype_util.h" #include "client/vsi_nn_vxkernel.h" -#include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (2) #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_SCALE_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_scale_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = (vsi_nn_scale_param *)node->nn_param.client_param; - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis ); - _SET_PARAM( 1, VX_TYPE_FLOAT32, bias ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - 
args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static void reshape_tensor_shape - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input, - vx_reference * params, - uint32_t index - ) -{ - uint32_t i; - int32_t size[4] = {0}; - int32_t size0[4] = {1, 1, 1, 1}; - uint32_t dims = 2; - - for( i = 0; i < input->attr.dim_num; i++ ) - { - size0[i] = input->attr.size[i]; - } - - size[0] = size0[0]; - size[1] = size0[1] * size0[2] * size0[3]; - - self->nn_param.scale.local.local_tensor[index] = - vxReshapeTensor(input->t, size, dims); - params[index] = (vx_reference)self->nn_param.scale.local.local_tensor[index]; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_IO_NUM]; - vx_border_t border; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - if (inputs[0]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 || - inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 || - inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 || - outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) - { - VSILOGE("scale shader unsuport format!\n"); - return VSI_FAILURE; - } - - reshape_tensor_shape(self, inputs[0], params, 0); - reshape_tensor_shape(self, inputs[1], params, 1); - reshape_tensor_shape(self, inputs[2], params, 2); - reshape_tensor_shape(self, outputs[0], params, 3); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _IO_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_init_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -229,31 +49,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_scale"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_SCALE_list; - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return status; - } - if (NULL != op_init_list[kernel_info.init_index]) - { - status = op_init_list[kernel_info.init_index](self, inputs, outputs); - } - return status; + return vsi_nn_internal_compute_node( self ); } /* op_compute() */ static vsi_bool op_check @@ -263,38 +59,55 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(SCALE, 3, 1) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - END_IO_TYPE_DECL(SCALE) - if(!VALIDATE_OP_IO_TYPES(SCALE, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = TRUE; + + vsi_nn_internal_init_node_wksp( node ); + + curr = vsi_nn_internal_new_node( node, VSI_NN_OP_A_TIMES_B_PLUS_C, node->input.num, node->output.num ); + curr->inputs[0] = inputs[0]; + curr->inputs[1] = inputs[1]; + curr->inputs[2] = inputs[2]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(node, curr); + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + static vsi_status op_deinit ( vsi_nn_node_t * self ) { - uint32_t i; - for (i = 0; i < _VSI_NN_SCALE_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.scale.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.scale.local.local_tensor[i])); - self->nn_param.scale.local.local_tensor[i] = NULL; - } - } + vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; -} /* op_deinit() */ - +} #ifdef __cplusplus extern "C" { #endif @@ -306,12 +119,11 @@ DEF_OP_REG /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, - /* setup */ vsi_nn_op_common_setup, - /* optimize */ NULL, + /* setup */ op_setup, + /* optimize */ op_optimize, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c index 5c02808..b87e1e6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c @@ -38,265 +38,12 @@ #include "client/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" -#define USE_OVXLIB (0) - #define _ARG_NUM (2) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#if (USE_OVXLIB) - -extern vx_kernel_description_t * vx_kernel_SHUFFLECHANNEL_list[]; - -static vsi_bool _reshape_tensor - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i = 0; - uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t axis = 0; - vsi_nn_shufflechannel_param * p = NULL; - uint32_t before_size = 1; - uint32_t after_size = 1; - uint32_t * input_sizes = inputs[0]->attr.size; - uint32_t dims = inputs[0]->attr.dim_num; - - p = &(self->nn_param.shufflechannel); - axis = p->axis; - - for ( i = 0; i < (uint32_t)axis; i++) - { - before_size *= input_sizes[i]; - } - for ( i = axis + 1; i < dims; i++) - { - after_size *= input_sizes[i]; - } - - if (axis == 2 && after_size == 1) - { - sizes[0] = input_sizes[0]; - sizes[1] = input_sizes[1]; - sizes[2] = input_sizes[2]; - } - else - { - sizes[0] = before_size; - sizes[1] = input_sizes[axis]; - sizes[2] = after_size; - p->axis = 1; - } - dims = 3; - - p->local->input_tensor = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, dims); - p->local->output_tensor = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, dims); - - return TRUE; -} - -static void _set_inputs_outputs - ( - vsi_nn_node_t * self, - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_nn_shufflechannel_param * p = NULL; - - p = &(self->nn_param.shufflechannel); - 
- params[0] = (vx_reference)p->local->input_tensor; - params[1] = (vx_reference)p->local->output_tensor; -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status = VSI_SUCCESS; - vx_context ctx; - vsi_nn_shufflechannel_param * p = NULL; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.shufflechannel); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, group_number ); - _SET_PARAM( 1, VX_TYPE_INT32, axis ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i = 0; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args = NULL; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( self, params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; - int8_t inputFixedPointPos = inputs[0]->attr.dtype.fl; - int8_t outputFixedPointPos = outputs[0]->attr.dtype.fl; - int32_t inputZeroPoint = inputs[0]->attr.dtype.zero_point; - int32_t outputZeroPoint = outputs[0]->attr.dtype.zero_point; - vx_float32 inputScale = inputs[0]->attr.dtype.scale; - vx_float32 outputScale = outputs[0]->attr.dtype.scale; - int32_t axis = self->nn_param.shufflechannel.axis; - uint32_t *sizes = inputs[0]->attr.size; - vsi_bool is16Bits = FALSE; - vsi_bool is8Bits = FALSE; - - is16Bits = ((inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16) - || (inputDataFormat == VSI_NN_TYPE_INT16 && outputDataFormat == VSI_NN_TYPE_INT16 - && inputFixedPointPos == outputFixedPointPos)) ? TRUE : FALSE; - is8Bits = ((inputDataFormat == VSI_NN_TYPE_INT8 && outputDataFormat == VSI_NN_TYPE_INT8 - && inputFixedPointPos == outputFixedPointPos) - || (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8 - && inputZeroPoint == outputZeroPoint && inputScale == outputScale)) ? 
TRUE : FALSE; -#define VSI_NN_TENSOR_WIDTH_MAX (65536) - kernel_info->kernel_index = 0; - if (sizes[0] < VSI_NN_TENSOR_WIDTH_MAX && sizes[1] < VSI_NN_TENSOR_WIDTH_MAX) - { - if ( is16Bits && axis == 2 ) - { - kernel_info->kernel_index = 1; - } - else if ( is8Bits && axis == 2) - { - kernel_info->kernel_index = 2; - } - else if ( is16Bits && axis == 1) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_shufflechannel_axis1"; - kernel_info->kernel_index = 3; - } - else if ( is8Bits && axis == 1) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_shufflechannel_axis1"; - kernel_info->kernel_index = 4; - } - } -#undef VSI_NN_TENSOR_WIDTH_MAX - - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - vx_reference * args = NULL; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( self, params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; - -#endif - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -304,41 +51,6 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { -#if(USE_OVXLIB) - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_info_t kernel_info; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - - /* setup input/output shape */ - _reshape_tensor( self, inputs, outputs); - - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_shufflechannel"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_SHUFFLECHANNEL_list; - kernel_info.init_index = 1; - - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) - { - vx_op_pre_compute(self, inputs, outputs, &kernel_info); - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } - return status; -#else vsi_status status = VSI_FAILURE; vx_nn_reorg_params_ext2_t param; vsi_nn_tensor_t *block_size_tensor = NULL; @@ -381,7 +93,6 @@ static vsi_status op_compute } return status; -#endif } /* op_compute() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c index 47a5ac7..3a2aea3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c @@ -37,208 +37,11 @@ #include "utils/vsi_nn_math.h" #include "client/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" +#include "vsi_nn_test.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (2) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _IO_NUM 
(_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_SPACE2DEPTH_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_space2depth_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.space2depth); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, block_size[0] ); - _SET_PARAM( 1, VX_TYPE_INT32, block_size[1] ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_type_e dataFormat = inputs[0]->attr.dtype.vx_type; - int8_t input_fixPointPos = 0; - int8_t output_fixPointPos = 0; - vx_bool dataTypeFlg = FALSE; - vsi_nn_tensor_attr_t attr[2]; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr(inputs[0]->t, &attr[0]); - status |= vsi_nn_vxGetTensorAttr(outputs[0]->t, &attr[1]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - - input_fixPointPos = attr[0].dtype.fl; - output_fixPointPos = attr[1].dtype.fl; - - if(input_fixPointPos == output_fixPointPos) - dataTypeFlg = TRUE; - - if ((dataFormat == VSI_NN_TYPE_INT16 && dataTypeFlg) || dataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 2; - } - else - { - VSILOGE("Not support input or output data format!(PRELU)\n"); - return VSI_FAILURE; - } - - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -248,23 +51,26 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - int32_t size_x = self->nn_param.space2depth.block_size[0]; - int32_t size_y = self->nn_param.space2depth.block_size[1]; - if (size_x == size_y) + if (self->nn_param.space2depth.block_size[0] == self->nn_param.space2depth.block_size[1]) { vx_nn_reorg_params_t param; vsi_nn_tensor_t *block_size_tensor = NULL; + vsi_nn_tensor_attr_t attr; memset(¶m, 0, sizeof(vx_nn_reorg_params_t)); - block_size_tensor = vsi_nn_VariableToTensor(self, - (uint8_t *)&self->nn_param.space2depth.block_size[0], - VSI_NN_TYPE_INT32); - if( NULL == block_size_tensor ) - { - VSILOGE("Create block_size_tensor fail.(space2depth)"); - return VSI_FAILURE; - } - self->nn_param.space2depth.local.block_size_tensor = block_size_tensor; + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 2; + attr.size[1] = 1; + attr.dim_num = 2; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + block_size_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)self->nn_param.space2depth.block_size, + &attr); + TEST_CHECK_PTR(block_size_tensor, final); + param.block_size = REQUIRED_IO(block_size_tensor); param.type = VX_REORG_SPACE_TO_DEPTH; @@ -274,46 +80,39 @@ static vsi_status op_compute sizeof(vx_nn_reorg_params_t), outputs[0]->t); - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } +final: + if (block_size_tensor) vsi_nn_ReleaseTensor(&block_size_tensor); } else { - vsi_nn_kernel_info_t kernel_info; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_space2depth"; - //kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_SPACE2DEPTH_list; - kernel_info.kernel_index = 1; - //kernel_info.init_index = 0; - kernel_info.init_index = 1; - - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) - { - vx_op_pre_compute(self, inputs, outputs, 
&kernel_info); - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } + status = vsi_nn_internal_compute_node( self ); } return status; } /* op_compute() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if (self->nn_param.space2depth.block_size[0] != self->nn_param.space2depth.block_size[1]) + { + return vsi_nn_internal_optimize_node(self, direction ); + } + else + { + return VSI_SUCCESS; + } +} /* op_optimize() */ + static vsi_bool op_check ( vsi_nn_node_t * self, @@ -321,7 +120,7 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - if(self->nn_param.space2depth.block_size[0] < 0 + if (self->nn_param.space2depth.block_size[0] < 0 || self->nn_param.space2depth.block_size[1] < 0) { VSILOGE("Block size can't be less than zero in space to depth"); @@ -341,7 +140,7 @@ static vsi_bool op_check /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(SPACE2DEPTH) - if(!VALIDATE_OP_IO_TYPES(SPACE2DEPTH, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(SPACE2DEPTH, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -353,6 +152,30 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_bool op_set_space2depth_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_op_t type_name + ) +{ + vsi_bool retn = TRUE; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp( self ); + + curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + curr->node->nn_param.space2depth_internal.block_size_x = + self->nn_param.space2depth.block_size[0]; + curr->node->nn_param.space2depth_internal.block_size_y = + self->nn_param.space2depth.block_size[1]; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + retn = vsi_nn_internal_setup_node(self, curr); + + return retn; +} static vsi_bool op_setup ( @@ -361,9 +184,10 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + vsi_bool ret = TRUE; uint32_t size_x = node->nn_param.space2depth.block_size[0]; uint32_t size_y = node->nn_param.space2depth.block_size[1]; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.size[0] = inputs[0]->attr.size[0] / size_x; @@ -372,7 +196,12 @@ static vsi_bool op_setup outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } - return TRUE; + if (size_x != size_y) + { + ret = op_set_space2depth_internal(node, inputs, outputs, VSI_NN_OP_SPACE2DEPTH_INTERNAL); + } + + return ret; } /* op_setup() */ static vsi_status op_deinit @@ -380,11 +209,14 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - if (self->nn_param.space2depth.local.block_size_tensor != NULL) + if (self->nn_param.space2depth.block_size[0] != self->nn_param.space2depth.block_size[1]) { - vsi_nn_ReleaseTensor(&(self->nn_param.space2depth.local.block_size_tensor)); + vsi_nn_internal_deinit_node_wksp(self); + } + else + { + 
vsi_nn_op_common_deinit(self); } - vsi_nn_op_common_deinit(self); return VSI_SUCCESS; } /* op_deinit() */ @@ -401,9 +233,9 @@ DEF_OP_REG /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ NULL, - /* input_num */ 1, - /* output_num */ 1 + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM ); #ifdef __cplusplus } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c new file mode 100644 index 0000000..5660eea --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c @@ -0,0 +1,159 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+*
+*****************************************************************************/
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "utils/vsi_nn_math.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+#define _INPUT_NUM (1)
+#define _OUTPUT_NUM (1)
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_param_t * param = NULL;
+    int32_t block_size_x = self->nn_param.space2depth_internal.block_size_x;
+    int32_t block_size_y = self->nn_param.space2depth_internal.block_size_y;
+
+    if ( NULL == self )
+    {
+        return VSI_FAILURE;
+    }
+
+    param = vsi_nn_kernel_param_create();
+
+    // Add params
+    vsi_nn_kernel_param_add_int32( param, "block_size_x", block_size_x );
+    vsi_nn_kernel_param_add_int32( param, "block_size_y", block_size_y );
+
+    self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "space2depth_internal", inputs, 1, outputs, 1, param );
+
+    if ( self->n != NULL )
+    {
+        status = VSI_SUCCESS;
+    }
+
+    if (param != NULL)
+    {
+        vsi_nn_kernel_param_release( &param );
+    }
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_setup
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    uint32_t size_x = self->nn_param.space2depth_internal.block_size_x;
+    uint32_t size_y = self->nn_param.space2depth_internal.block_size_y;
+    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    {
+        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+        outputs[0]->attr.size[0] = inputs[0]->attr.size[0] * size_x;
+        outputs[0]->attr.size[1] = inputs[0]->attr.size[1] * size_y;
+        outputs[0]->attr.size[2] = inputs[0]->attr.size[2] / (size_x * size_y);
+        outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
+    }
+
+    return TRUE;
+} /* op_setup() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(SPACE2DEPTH_INTERNAL, 1, 1)
+        IO_TYPE(D_F16, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
+        IO_TYPE(D_F32, D_F32)
+        IO_TYPE(D_F32, D_BF16)
+        IO_TYPE(D_BF16, D_F32)
+
+        /* HW 9.0 */
+        IO_TYPE(D_BF16, D_BF16)
+    END_IO_TYPE_DECL(SPACE2DEPTH_INTERNAL)
+    if (!VALIDATE_OP_IO_TYPES(SPACE2DEPTH_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) {
+        char* desc = generate_op_io_types_desc(inputs,
+                self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+static vsi_status op_deinit
+    (
+    vsi_nn_node_t * self
+    )
+{
+    vsi_nn_op_common_deinit(self);
+
+    return VSI_SUCCESS;
+} /* op_deinit() */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+DEF_OP_REG
+    (
+    /* op_name    */ SPACE2DEPTH_INTERNAL,
+    /* init       */ NULL,
+    /* compute    */ op_compute,
+    /* deinit     */ op_deinit,
+    /* check      */ op_check,
+    /* setup      */ op_setup,
+    /* optimize   */ NULL,
+    /* input_num  */ _INPUT_NUM,
+    /* output_num */ _OUTPUT_NUM
+    );
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c
index 4283e40..39d32a5 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c
@@ -346,10 +346,10 @@ static vsi_bool op_check
 {
     BEGIN_IO_TYPE_DECL(TENSORSTACKCONCAT, 2, 1)
-        IO_TYPE(D_F16, D_F16, D_F16)
-        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
-        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
-        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_F16, D_I32, D_F16)
+        IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP)
+        IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP)
     END_IO_TYPE_DECL(TENSORSTACKCONCAT)
     if(!VALIDATE_OP_IO_TYPES(TENSORSTACKCONCAT, self, inputs, self->input.num, outputs, self->output.num)) {
         char* desc = generate_op_io_types_desc(inputs,
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c
index 906fb7c..5717fe3 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c
@@ -182,7 +182,7 @@ static vsi_bool op_setup
     vsi_nn_internal_setup_node( self, curr );

     slices = (uint32_t *)vsi_nn_internal_new_node_param(curr,
-        VSI_NN_MAX_DIM_NUM * sizeof(uint32_t));
+        tensor_num * sizeof(uint32_t));
     curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, tensor_num );
     curr->node->nn_param.split.axis = 1;
     curr->node->nn_param.split.slices = slices;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c
new file mode 100644
index 0000000..c79c373
--- /dev/null
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c
@@ -0,0 +1,253 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+typedef struct _upsamplescale_local_data_t {
+    int32_t placeholder;
+} upsamplescale_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM (1)
+#define _OUTPUT_NUM (1)
+
+#define _EPSILON 1e-8
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    int32_t stride = self->nn_param.upsamplescale.stride;
+    float scale = self->nn_param.upsamplescale.scale;
+    vsi_nn_kernel_param_t * param = NULL;
+
+    if( NULL == self )
+    {
+        return VSI_FAILURE;
+    }
+
+    if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON)
+    {
+        return vsi_nn_internal_compute_node( self );
+    }
+
+    param = vsi_nn_kernel_param_create();
+
+    vsi_nn_kernel_param_add_int32( param, "stride", stride );
+    vsi_nn_kernel_param_add_float32( param, "scale", scale );
+
+    self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+        "upsamplescale",
+        inputs, 1,
+        outputs, 1, param );
+
+    vsi_nn_kernel_param_release( &param );
+
+    if( self->n )
+    {
+        status = VSI_SUCCESS;
+    }
+
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(UPSAMPLESCALE, 1, 1)
+        IO_TYPE(D_F16, D_U8|Q_ASYM)
+        IO_TYPE(D_F16, D_I16|Q_DFP)
+        IO_TYPE(D_F16, D_I8|Q_DFP)
+        IO_TYPE(D_F16, D_U8)
+        IO_TYPE(D_F16, D_I16)
+        IO_TYPE(D_F16, D_I8)
+        IO_TYPE(D_F16, D_F16)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
+        IO_TYPE(D_U8|Q_ASYM, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_I8|Q_DFP, D_F16)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+        IO_TYPE(D_I16|Q_DFP, D_F16)
+        IO_TYPE(D_U8, D_U8)
+        IO_TYPE(D_U8, D_F16)
+        IO_TYPE(D_I8, D_I8)
+        IO_TYPE(D_I8, D_F16)
+        IO_TYPE(D_I16, D_I16)
+        IO_TYPE(D_I16, D_F16)
+    END_IO_TYPE_DECL(UPSAMPLESCALE)
+    if (!VALIDATE_OP_IO_TYPES(UPSAMPLESCALE, self, inputs, self->input.num, outputs, self->output.num))
+    {
+        char* desc = generate_op_io_types_desc(inputs,
+                self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+static vsi_status op_optimize
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs,
+    vsi_nn_opt_direction_e direction
+    )
+{
+    int32_t stride = self->nn_param.upsamplescale.stride;
+    float scale = self->nn_param.upsamplescale.scale;
+
+    if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
+    {
+        return vsi_nn_internal_optimize_node( self, direction );
+    }
+    else
+    {
+        return VSI_SUCCESS;
+    }
+}
+
+static vsi_bool op_setup
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    /* TODO: Add code to compute outputs' shape.
*/ + int32_t stride = self->nn_param.upsamplescale.stride; + float scale = self->nn_param.upsamplescale.scale; + int32_t i = 0; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp(self); + + if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (stride == 1) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LINEAR, 0, 0); + curr->node->nn_param.linear.a = scale; + curr->node->nn_param.linear.b = 0; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (vsi_nn_abs(scale - 1.0f) == _EPSILON) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0); + curr->node->nn_param.resize.type = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR; + curr->node->nn_param.resize.align_corners = FALSE; + curr->node->nn_param.resize.half_pixel_centers = FALSE; + curr->node->nn_param.resize.size[0] = inputs[0]->attr.size[0] * stride; + curr->node->nn_param.resize.size[1] = inputs[0]->attr.size[1] * stride; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + } + else + { + outputs[0]->attr.size[0] = inputs[0]->attr.size[0] * stride; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1] * stride; + for (i = 2; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ UPSAMPLESCALE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 518b099..16c1bff 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -418,6 +418,7 @@ static _op_param_gen_t s_op_gen[] = /* DECONVOLUTION1D */ NULL, /* INTERP */ NULL, /* RESIZE_1D */ NULL, + /* UPSAMPLESCALE */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 392370a..a49d8f8 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -588,11 +588,6 @@ vsi_status vsi_nn_SetupGraph vsi_status status; vsi_nn_node_id_t *sorted_nodes; vsi_nn_node_id_t *nodes_list; - uint32_t num_of_graph_inputs; - vx_reference *graph_inputs = NULL; - uint32_t num_of_graph_outputs; - vx_reference *graph_outputs = NULL; - vsi_nn_tensor_t *tensor; vsi_bool dirty = FALSE; status = VSI_FAILURE; @@ -670,54 +665,9 @@ vsi_status vsi_nn_SetupGraph status = vsi_nn_TrySetupCompleteSignalNode( graph ); TEST_CHECK_STATUS( status, final ); - /* Explicitly set graph inputs 
and outputs */ - num_of_graph_inputs = graph->input.num; - graph_inputs = (vx_reference *)malloc( num_of_graph_inputs * sizeof( vx_reference ) ); - for( i = 0; i < num_of_graph_inputs; i++ ) - { - tensor = vsi_nn_GetTensor( graph, graph->input.tensors[i] ); - if (tensor) - { - graph_inputs[i] = (vx_reference)( tensor->t ); - } - else - { - graph_inputs[i] = NULL; - } - } - num_of_graph_outputs = graph->output.num; - if( graph->complete_signal.exists ) - { - num_of_graph_outputs += 1; - } - graph_outputs = (vx_reference *)malloc( num_of_graph_outputs * sizeof( vx_reference ) ); - for( i = 0; i < num_of_graph_outputs; i++ ) - { - tensor = vsi_nn_GetTensor( graph, graph->output.tensors[i] ); - if (tensor) - { - graph_outputs[i] = (vx_reference)( tensor->t ); - } - else - { - graph_outputs[i] = NULL; - } - } - if( graph->complete_signal.exists ) - { - graph_outputs[num_of_graph_outputs - 1] = \ - (vx_reference)graph->complete_signal.tensor->t; - } - status = vxIdentifyGraphInputsAndOutputs( graph->g, - num_of_graph_inputs, - graph_inputs, - num_of_graph_outputs, - graph_outputs ); - - if( VSI_SUCCESS != status ) - { - goto final; - } + /* Setup binary graph inputs and outputs. */ + status = vsi_nn_setup_binary_graph_inputs_outputs( graph ); + TEST_CHECK_STATUS( status, final ); final: if( NULL != sorted_nodes ) @@ -728,14 +678,6 @@ final: { free( nodes_list ); } - if ( NULL != graph_inputs) - { - free( graph_inputs ); - } - if ( NULL != graph_outputs) - { - free( graph_outputs ); - } return status; } /* vsi_nn_SetupGraph() */ @@ -1599,33 +1541,46 @@ void vsi_nn_DumpGraphToJson for(j = 0; j < node->input.num; j++) { tio = &tensor_ref[node->input.tensors[j]]; - if(tio->input.num > 0) + if(NULL == vsi_nn_GetTensor(graph, node->input.tensors[j])) { - table = tio->input.table; - - /* tensor only 1 input node */ - in_node = vsi_nn_GetNode(graph, table[0].node); if(j == node->input.num - 1) { - fprintf(fp, "\"@uid_%u:out%u\" ", in_node->uid, table[0].index); + fprintf(fp, "\"not used\" "); } else { - fprintf(fp, "\"@uid_%u:out%u\", ", in_node->uid, table[0].index); + fprintf(fp, "\"not used\", "); } } else { - if(j == node->input.num - 1) + if(tio->input.num > 0) { - fprintf(fp, "\"datainput_%u:out0\" ", j); + table = tio->input.table; + + /* tensor only 1 input node */ + in_node = vsi_nn_GetNode(graph, table[0].node); + if(j == node->input.num - 1) + { + fprintf(fp, "\"@uid_%u:out%u\" ", in_node->uid, table[0].index); + } + else + { + fprintf(fp, "\"@uid_%u:out%u\", ", in_node->uid, table[0].index); + } } else { - fprintf(fp, "\"datainput_%u:out0\", ", j); + if(j == node->input.num - 1) + { + fprintf(fp, "\"datainput_%u:out0\" ", j); + } + else + { + fprintf(fp, "\"datainput_%u:out0\", ", j); + } } } - } /* dump input shape */ @@ -1633,14 +1588,14 @@ void vsi_nn_DumpGraphToJson for(j = 0; j < node->input.num; j++) { tensor = vsi_nn_GetTensor(graph, node->input.tensors[j]); - if(vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, + if(NULL != tensor && vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, shape, _SHAPE_BUF_SIZE, TRUE ) > 0) { fprintf(fp, "[%s ]", shape); } else { - fprintf(fp, "[ - ]"); + fprintf(fp, "[]"); } if(j < node->input.num - 1) { @@ -1667,14 +1622,14 @@ void vsi_nn_DumpGraphToJson for(j = 0; j < node->output.num; j++) { tensor = vsi_nn_GetTensor(graph, node->output.tensors[j]); - if(vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, + if(NULL != tensor && vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, shape, _SHAPE_BUF_SIZE, 
TRUE ) > 0) { fprintf(fp, "[%s ]", shape); } else { - fprintf(fp, "[ - ]"); + fprintf(fp, "[]"); } if(j < node->output.num - 1) { @@ -1762,6 +1717,124 @@ final: return status; } /* vsi_nn_TrySetupCompleteSignalNode() */ + +/* + * Documented in vsi_nn_graph.h + */ +vsi_status vsi_nn_setup_binary_graph_inputs_outputs + ( + vsi_nn_graph_t* graph + ) +{ + uint32_t i,j; + vsi_status status; + uint32_t num_of_graph_inputs; + uint32_t num_of_graph_real_inputs; + vx_reference *graph_inputs = NULL; + uint32_t num_of_graph_outputs; + uint32_t num_of_graph_real_outputs; + vx_reference *graph_outputs = NULL; + vsi_nn_tensor_t *tensor; + + num_of_graph_real_inputs = 0; + num_of_graph_real_outputs = 0; + + /* Explicitly set graph inputs and outputs */ + num_of_graph_inputs = graph->input.num; + for( i = 0; i < num_of_graph_inputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->input.tensors[i] ); + if (tensor) + { + num_of_graph_real_inputs += 1; + } + else + { + ;//do nothing + } + } + graph_inputs = (vx_reference *)malloc( num_of_graph_real_inputs * sizeof( vx_reference ) ); + for( i = 0, j = 0; i < num_of_graph_inputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->input.tensors[i] ); + if (tensor) + { + if(j > num_of_graph_real_inputs -1) + { + status = VSI_FAILURE; + goto final; + } + graph_inputs[j++] = (vx_reference)( tensor->t ); + } + else + { + ;//do nothing + } + } + num_of_graph_outputs = graph->output.num; + if( graph->complete_signal.exists ) + { + num_of_graph_outputs += 1; + } + for( i = 0; i < num_of_graph_outputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->output.tensors[i] ); + if (tensor) + { + num_of_graph_real_outputs += 1; + } + else + { + ;//do nothing + } + } + graph_outputs = (vx_reference *)malloc( num_of_graph_real_outputs * sizeof( vx_reference ) ); + for( i = 0, j = 0; i < num_of_graph_outputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->output.tensors[i] ); + if (tensor) + { + if(j > num_of_graph_real_outputs -1) + { + status = VSI_FAILURE; + goto final; + } + graph_outputs[j++] = (vx_reference)( tensor->t ); + } + else + { + ;//do nothing + } + } + if( graph->complete_signal.exists ) + { + graph_outputs[num_of_graph_real_outputs - 1] = \ + (vx_reference)graph->complete_signal.tensor->t; + } + + status = vxIdentifyGraphInputsAndOutputs( graph->g, + num_of_graph_real_inputs, + graph_inputs, + num_of_graph_real_outputs, + graph_outputs ); + + if( VSI_SUCCESS != status ) + { + goto final; + } + +final: + if ( NULL != graph_inputs) + { + free( graph_inputs ); + } + if ( NULL != graph_outputs) + { + free( graph_outputs ); + } + return status; +} /* vsi_nn_setup_binary_graph_inputs_outputs() */ + vsi_status vsi_nn_SetupRNNConnections ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 51083b1..c7c49a6 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -241,6 +241,31 @@ static void _set_preproc_node_input_attr input_attr->size[2] = 1; } } + + if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY) + { + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + input_attr->size[0] = input_size->w; + input_attr->size[1] = input_size->h; + input_attr->size[2] = input_size->c; + } + } + + if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA) + { + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + input_attr->size[0] = 
4*input_attr->size[1]; + input_attr->size[1] = input_attr->size[2]; + input_attr->size[2] = 1; + } + else + { + input_attr->size[0] = 4*input_attr->size[0]; + input_attr->size[2] = 1; + } + } } /*_set_preproc_node_input_attr() */ static void _set_preproc_node_output_attr
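[Editor's note on the BGRA branch above] With either source layout the BGRA preprocess input is reshaped to a flat 4*W x H x 1 byte plane. A minimal worked example of the resulting attr, assuming an illustrative 224x224 BGRA source (the concrete W/H values are not from the patch):

    /* BGRA source, W = 224, H = 224 (illustrative values only) */
    input_attr->size[0] = 4 * 224;  /* 896: interleaved B,G,R,A bytes per row */
    input_attr->size[1] = 224;      /* image rows */
    input_attr->size[2] = 1;        /* single plane */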