diff --git a/src/tim/vx/internal/BUILD b/src/tim/vx/internal/BUILD index 97e4591..186c6a9 100644 --- a/src/tim/vx/internal/BUILD +++ b/src/tim/vx/internal/BUILD @@ -195,14 +195,6 @@ cc_library( "src/kernel/vsi_nn_kernel_param.c", "src/kernel/vsi_nn_gpu.c", "src/kernel/vsi_nn_kernel_gpu_shape_optimize.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c", "src/libnnext/vsi_nn_libnnext_resource.c", "src/libnnext/vsi_nn_vxkernel.c", ] + [":kernel_srcs"] diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 6315513..f511b00 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -156,3 +156,5 @@ DEF_OP(ERF) DEF_OP(ONE_HOT) DEF_OP(NMS) DEF_OP(GROUPED_CONV1D) +DEF_OP(SCATTER_ND_UPDATE) +DEF_OP(GELU) \ No newline at end of file diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h new file mode 100644 index 0000000..5cb011c --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h @@ -0,0 +1,37 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GELU_H +#define _VSI_NN_OP_GELU_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_gelu_param +{ + vsi_bool approximate; +} vsi_nn_gelu_param; + + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h new file mode 100644 index 0000000..68e1b29 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SCATTER_ND_UPDATE_H +#define _VSI_NN_OP_SCATTER_ND_UPDATE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_scatter_nd_update_param +{ + vsi_bool use_locking; +} vsi_nn_scatter_nd_update_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_signalframe.h b/src/tim/vx/internal/include/ops/vsi_nn_op_signalframe.h index 6ef0871..e6ff0c4 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_signalframe.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_signalframe.h @@ -53,7 +53,11 @@ typedef struct _vsi_nn_signalframe_param uint32_t window_length; uint32_t step; uint32_t pad_end; - uint32_t pad; + union + { + uint32_t pad; + float pad_value; + }; uint32_t axis; } vsi_nn_signalframe_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_spatial_transformer.h b/src/tim/vx/internal/include/ops/vsi_nn_op_spatial_transformer.h index 0716fa9..d4a7c20 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_spatial_transformer.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_spatial_transformer.h @@ -55,8 +55,7 @@ typedef struct _vsi_nn_spatial_transformer_param float theta_2_1; float theta_2_2; float theta_2_3; - - vsi_nn_spatial_transformer_lcl_data lcl; + vsi_bool align_corners; } vsi_nn_spatial_transformer_param; #ifdef __cplusplus @@ -64,4 +63,3 @@ typedef struct _vsi_nn_spatial_transformer_param #endif #endif - diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 9a571b9..cfeb25b 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -63,8 +63,16 @@ typedef 
struct _vsi_nn_hw_config_t #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT uint32_t subGroupSize; #endif + uint32_t use_40bits_va; } vsi_nn_hw_config_t; +typedef struct _vsi_nn_runtime_option_t +{ + int32_t enable_shader; + int32_t enable_opcheck; + int32_t enable_concat_optimize; +} vsi_nn_runtime_option_t; + /** * Ovxlib NN runtime context. */ @@ -72,6 +80,7 @@ typedef struct _vsi_nn_context_t { vx_context c; vsi_nn_hw_config_t config; + vsi_nn_runtime_option_t options; } *vsi_nn_context_t; /** diff --git a/src/tim/vx/internal/include/vsi_nn_internal_node.h b/src/tim/vx/internal/include/vsi_nn_internal_node.h index 4c8113c..e314f7d 100644 --- a/src/tim/vx/internal/include/vsi_nn_internal_node.h +++ b/src/tim/vx/internal/include/vsi_nn_internal_node.h @@ -87,7 +87,8 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor ( vsi_nn_node_t* node, vsi_nn_tensor_attr_t* input_attr, - vsi_nn_tensor_attr_t* weight_attr + vsi_nn_tensor_attr_t* weight_attr, + vsi_bool use_virtual_tensor ); vsi_status vsi_nn_internal_deinit_node diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index f9a4606..b490601 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -170,6 +170,8 @@ #include "ops/vsi_nn_op_one_hot.h" #include "ops/vsi_nn_op_nms.h" #include "ops/vsi_nn_op_grouped_conv1d.h" +#include "ops/vsi_nn_op_scatter_nd_update.h" +#include "ops/vsi_nn_op_gelu.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -326,6 +328,8 @@ typedef union _vsi_nn_nn_param vsi_nn_one_hot_param one_hot; vsi_nn_nms_param nms; vsi_nn_grouped_conv1d_param grouped_conv1d; + vsi_nn_scatter_nd_update_param scatter_nd_update; + vsi_nn_gelu_param gelu; uint8_t client_param[128]; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index 895307e..bf39d49 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -721,6 +721,13 @@ vsi_status vsi_nn_SwapHandle void ** old_ptr ); +vsi_bool vsi_nn_ConvertTensor + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* input, + vsi_nn_tensor_t* output + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 5e544c2..1e7123d 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 32 +#define VSI_NN_VERSION_PATCH 33 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c index abedba1..1eeb997 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c @@ -35,7 +35,6 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vsi_nn_vxkernel.h" -//#include "libnnext/vx_lib_nnext.h" #define _CPU_ARG_NUM (1) #define _CPU_INPUT_NUM (1) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c index 34d679b..f60ae11 
100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c @@ -35,7 +35,6 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vsi_nn_vxkernel.h" -//#include "libnnext/vx_lib_nnext.h" #define _CPU_ARG_NUM (1) #define _CPU_INPUT_NUM (1) diff --git a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c index 163ebe1..0e94576 100644 --- a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -284,4 +283,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( add_mean_std_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/cast_cl.c b/src/tim/vx/internal/src/kernel/cl/cast_cl.c index 112cfca..6b578af 100644 --- a/src/tim/vx/internal/src/kernel/cl/cast_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cast_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -280,4 +279,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( cast, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index d8cb733..f8ec75d 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -223,7 +222,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -303,4 +301,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( clip, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c index 2fe07d1..697c34e 100644 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -191,7 +190,6 @@ static vsi_status _query_kernel { *is_use_u8_kernel = FALSE; param_def_size = _DETECT_POST_BOX_F32_PARAM_NUM; - } key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype ); @@ -311,4 +309,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( detect_post_box, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c index 193be18..d54182d 100644 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS #if 0 @@ -188,4 +187,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( detect_post_nms, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 6b0d6d5..50416af 100644 --- 
a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -49,6 +49,8 @@ typedef enum UNARY_HSIGMOID, UNARY_MISH, UNARY_ROUND, + UNARY_GELU, + UNARY_HGELU } unary_type_e; /* @@ -94,6 +96,8 @@ typedef enum #define HSIGMOID_OPERATION hard_sigmoid #define MISH_OPERATION mish #define ROUND_OPERATION round +#define GELU_OPERATION gelu +#define HGELU_OPERATION hard_gelu static const struct { uint32_t key; @@ -117,6 +121,10 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16) TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F32, F32) TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16) @@ -134,6 +142,10 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16) TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8) @@ -143,6 +155,8 @@ static const struct { TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8) TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, U8) + TENSOR_UNARY_KERNELS(HGELU_OPERATION, UNARY_HGELU, U8, U8) TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8) @@ -152,6 +166,8 @@ static const struct { TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8) TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8) TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32) @@ -166,6 +182,8 @@ static const struct { #undef HSIGMOID_OPERATION #undef MISH_OPERATION #undef ROUND_OPERATION +#undef GELU_OPERATION +#undef HGELU_OPERATION /* * Kernel params */ @@ -417,4 +435,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH ) REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index a500383..11029f6 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" 
-#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c index 88cd40f..410fe56 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -210,4 +209,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( grucell_activation, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c index 9fb557f..1a849fe 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -210,4 +209,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( grucell_activation_sma, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c index c604edb..8d4d7b3 100644 --- a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -331,4 +330,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( l2normalizescale, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c index 67cf6e8..1e0780e 100644 --- a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -240,4 +239,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( logical_not, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c index e209157..4b518be 100644 --- a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c @@ -307,7 +307,8 @@ static vsi_status _query_kernel kernel->info.parameters = param_def; kernel->info.numParams = (uint32_t)param_size; kernel->info.initialize = initializer; - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", kernel_map[i].source_name ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, kernel_map[i].source_name ); diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c new file mode 100644 index 0000000..4f2e8c7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c @@ -0,0 +1,376 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this 
software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define KERNEL_SOURCE_1 "scatter_nd_update" + +#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _coord_dim) \ + ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_coord_dim)) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE) + +#define TENSOR_SCATTER_ND_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } scatter_nd_update_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_KERNELS(I32, I32, I32, I32, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(U32, I32, U32, U32, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(F32, I32, F32, F32, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t _scatter_nd_update_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _SCATTER_ND_UPDATE_PARAM_NUM _cnt_of_array(_scatter_nd_update_kernel_param_def) + +static vsi_status cal_scatter_nd_update_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + int32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t coordDim, + uint32_t* width, + uint32_t* area, + uint32_t* vol, + int32_t* newDim + ) +{ + vsi_status status = 
VSI_FAILURE; + uint32_t dims_num = inputs[0]->attr.dim_num; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + uint32_t elementCnt = 1; + + if (coordDim != 0 && (width == NULL || area == NULL)) + { + return status; + } + +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + + newDim[0] = 0; + for(i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + status = VSI_SUCCESS; + newDim[0] = 2; + } + else + { + return status; + } + + if (coordDim == 1) // index shape + { + *width = 0; + *area = 0; + } + else if (coordDim == 2) + { + *width = input_size[dims_num - 2]; + *area = 0; + } + else if (coordDim == 3) + { + *width = input_size[dims_num - 3]; + *area = input_size[dims_num - 3] * input_size[dims_num - 2]; + } + else if (coordDim == 4) + { + *width = input_size[dims_num - 4]; + *area = input_size[dims_num - 4] * input_size[dims_num - 3]; + *vol = input_size[dims_num - 4] * input_size[dims_num - 3] * input_size[dims_num - 2]; + } + else if (coordDim == 5) + { + *width = input_size[dims_num - 5]; + *area = input_size[dims_num - 5] * input_size[dims_num - 4]; + *vol = input_size[dims_num - 5] * input_size[dims_num - 4] * input_size[dims_num - 3]; + } +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* _get_EltOP_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t block_size = 0; + int32_t height = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + block_size = attr[0]->shape->data[0]; + height = attr[0]->shape->data[1]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = block_size; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t coord_dim + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0 ); + + for( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ ) + { + if ( scatter_nd_update_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", 
scatter_nd_update_map[i].function_name ); + kernel->info.parameters = _scatter_nd_update_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _scatter_nd_update_kernel_param_def ); + kernel->info.initialize = _scatter_nd_update_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + scatter_nd_update_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); + int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + uint32_t width = 0, area = 0, vol = 0; + int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; + + status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], + coord_dim, 0, NULL, NULL, NULL, &rs_in_dim); + status |= cal_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], + block_size, 0, NULL, NULL, NULL, &rs_idx_dim); + status |= cal_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], + block_size, coord_dim, &width, &area, &vol, &rs_out_dim); + if (status != VSI_SUCCESS) + { + return NULL; + } + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if (coord_dim == 5) + { + offset_idx = 1; + } + if (coord_dim == 4 || coord_dim == 5) + { + offsetX = vol; + offsetY = area; + offsetZ = width; + offsetW = 1; + } + else if (coord_dim == 3) + { + offsetX = area; + offsetY = width; + offsetZ = 1; + offsetW = 0; + } + else if (coord_dim == 2) + { + offsetX = width; + offsetY = 1; + offsetZ = 0; + offsetW = 0; + } + else if (coord_dim == 1) + { + offsetX = 1; + offsetY = 0; + offsetZ = 0; + offsetW = 0; + } + + status = _query_kernel( kernel, inputs, outputs, coord_dim ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_in_dim ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_idx_dim ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetX ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetY ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetZ ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetW ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offset_idx ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ND_UPDATE_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &node_params[0] ); + vsi_nn_kernel_tensor_release( &node_params[1] ); + vsi_nn_kernel_tensor_release( &node_params[2] ); + vsi_nn_kernel_tensor_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( scatter_nd_update, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c new file mode 100644 index 0000000..f73089c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c @@ -0,0 +1,298 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + +#define SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + ( ( IN_DTYPE << 8 ) | ( OUT_DTYPE ) ) +#define SIGNAL_FRAME_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.signal_frame_"#IN_DTYPE"to"#OUT_DTYPE), \ + "signal_frame" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _signal_frame_kernel_map[] = +{ + // Register kernel here + SIGNAL_FRAME_KERNEL_MAP( F32, F32 ), + + SIGNAL_FRAME_KERNEL_MAP( U8, U8) +}; + +/* + * Kernel params + */ +static vx_param_description_t _signal_frame_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def ) +#define FRAME_STEP (2) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_signal_frame_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _erf_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + in_dtype = in_dtype == F16 ? F32 : in_dtype; + out_dtype = out_dtype == F16 ? F32 : out_dtype; + key = SIGNAL_FRAME_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < _cnt_of_array(_signal_frame_kernel_map); i ++ ) + { + if ( _signal_frame_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_signal_frame_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _signal_frame_kernel_map[i].function_name ); + kernel->info.parameters = _signal_frame_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def ); + kernel->info.initialize = _signal_frame_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + _signal_frame_kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _signal_frame_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" ); + int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + float pad_value = vsi_nn_kernel_param_get_float32( params, "pad_val" ); + int32_t num_frames = outputs[0]->attr.size[axis + 1]; + int32_t rank = inputs[0]->attr.dim_num; + int32_t inner = 1; + int32_t outer = 1; + int32_t length_samples = inputs[0]->attr.size[axis]; + int32_t i = 0; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + + for (i = 0; i < axis; i++) + { + inner *= inputs[0]->attr.size[i]; + } + + for (i = axis + 1; i < rank; i++) + { + outer *= inputs[0]->attr.size[i]; + } + + shape[0][0] = inner; + shape[0][1] = length_samples; + shape[0][2] = 1; + shape[0][3] = outer; + + shape[1][0] = inner; + shape[1][1] = frame_length; + shape[1][2] = num_frames; + shape[1][3] = outer; + + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape[0], 4 ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape[1], 4 ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + rs_tensors[1]->attr.dim_num ) ) + { + return 
NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + if ( pad_end ) + { + // Set default border mode. + vx_border_t border; + uint32_t data = 0; + uint32_t dsize = 1; + + vsi_nn_Float32ToDtype(pad_value, (uint8_t*)&data, &outputs[0]->attr.dtype); + border.mode = VX_BORDER_CONSTANT; + dsize = vsi_nn_GetTypeBytes( inputs[0]->attr.dtype.vx_type ); + if ( dsize == 1 ) + { + border.constant_value.U8 = (uint8_t)data; + } + else if ( dsize == 4 ) + { + border.constant_value.U32 = data; + } + else + { + border.constant_value.U16 = (uint16_t)data; + } + + status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + } + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM, + rs_tensors, 1, &rs_tensors[1], 1 ); + + node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create( + graph, I32, &frame_step ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + +final: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + if (node_params[FRAME_STEP]) + { + vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( signal_frame, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/slice_cl.c b/src/tim/vx/internal/src/kernel/cl/slice_cl.c index d05a32e..f0da3cb 100644 --- a/src/tim/vx/internal/src/kernel/cl/slice_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/slice_cl.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -66,7 +65,6 @@ __BEGIN_DECLS { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \ SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } - typedef struct { uint32_t key; @@ -221,7 +219,6 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -268,7 +265,7 @@ static vsi_nn_kernel_node_t _setup if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, inputs[0]->attr.dim_num ) || input_batch != output_batch ) { - return NULL; + goto final; } image_2d = (rank[0] < 3 || shapes[0][2] == 1); @@ -300,6 +297,13 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); } } + +final: + for (i = 0; i < _IO_NUM; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c b/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c new file mode 100644 index 0000000..c471f16 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c @@ -0,0 +1,535 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is 
furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (4) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.box_with_nms_limit") + +/* + * Kernel params + */ +static vx_param_description_t _box_with_nms_limit_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _BOX_WITH_NMS_LIMIT_PARAM_NUM _cnt_of_array( _box_with_nms_limit_kernel_param_def ) +#define SCORE_THRESHOLD (7) +#define MAX_NUM_DETECTIONS (8) +#define NMS_KERNEL_METHOD (9) +#define IOU_THRESHOLD (10) +#define SIGMA (11) +#define NMS_SCORE_THRESHOLD (12) + +static float hard_nms_kernel + ( + float iou, + float iouThreshold + ) +{ + return iou < iouThreshold ? 1.0f : 0.0f; +} + +static float linear_nms_kernel + ( + float iou, + float iouThreshold + ) +{ + return iou < iouThreshold ? 
1.0f : 1.0f - iou; +} + +static float gaussian_nms_kernel + ( + float iou, + float sigma + ) +{ + return (float)(exp(-1.0f * iou * iou / sigma)); +} + +void swap_element + ( + uint32_t* list, + uint32_t first, + uint32_t second + ) +{ + uint32_t temp = list[first]; + list[first] = list[second]; + list[second] = temp; +} + +uint32_t max_element + ( + float* data, + uint32_t* index_list, + uint32_t len + ) +{ + uint32_t i; + uint32_t max_index = 0; + float max_val = data[index_list[0]]; + for(i = 1; i < len; i++) + { + float val = data[index_list[i]]; + if (max_val < val) + { + max_val = val; + max_index = i; + } + } + return max_index; +} + +static uint32_t max_comp_func + ( + void* data, + int32_t left, + int32_t right + ) +{ + float* fdata = (float*)data; + return fdata[left] >= fdata[right]; +} + +void sort_element_by_score + ( + float* data, + uint32_t* index_list, + uint32_t len + ) +{ + vsi_nn_partition(data, 0, len - 1, max_comp_func, TRUE, index_list); +} + +typedef struct +{ + float* fdata; + uint32_t numClasses; +} class_comp_param; + +static uint32_t class_comp_func + ( + void* data, + int32_t left, + int32_t right + ) +{ + class_comp_param *p = (class_comp_param*)data; + float* fdata = p->fdata; + uint32_t numClasses = p->numClasses; + uint32_t lhsClass = left % numClasses, rhsClass = right % numClasses; + return lhsClass == rhsClass ? fdata[left] > fdata[right] + : lhsClass < rhsClass; +} + +static void sort_element_by_class + ( + float* data, + uint32_t* index_list, + uint32_t len, + uint32_t numClasses + ) +{ + class_comp_param class_comp; + class_comp.fdata = data; + class_comp.numClasses = numClasses; + vsi_nn_partition(&class_comp, 0, len - 1, class_comp_func, TRUE, index_list); +} + +// Taking two indices of bounding boxes, return the intersection-of-union. 
+float getIoUAxisAligned + ( + const float* roi1, + const float* roi2 + ) +{ + const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]); + const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]); + const float x1 = vsi_nn_max(roi1[0], roi2[0]); + const float x2 = vsi_nn_min(roi1[2], roi2[2]); + const float y1 = vsi_nn_max(roi1[1], roi2[1]); + const float y2 = vsi_nn_min(roi1[3], roi2[3]); + const float w = vsi_nn_max(x2 - x1, 0.0f); + const float h = vsi_nn_max(y2 - y1, 0.0f); + const float areaIntersect = w * h; + const float areaUnion = area1 + area2 - areaIntersect; + return areaIntersect / areaUnion; +} +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + int32_t* int32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + int32_t* int32_out_buffer[_OUTPUT_NUM] = {0}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + float score_threshold = 0; + int32_t max_num_detections = 0; + int32_t nms_kernel_method = 0; + float iou_threshold = 0; + float sigma = 0; + float nms_score_threshold = 0; + uint32_t j = 0, n = 0, b = 0, c = 0; + const uint32_t kRoiDim = 4; + uint32_t numRois = 0; + uint32_t numClasses = 0; + int32_t ind = 0; + uint32_t * batch_data = NULL; + int32_t numBatch = 0; + uint32_t * select = NULL; + uint32_t select_size = 0; + uint32_t scores_index = 0; + uint32_t roi_index = 0; + uint32_t roi_out_index = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + if (i == 2) + { + int32_in_buffer[i] = (int32_t*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( int32_in_buffer[i], "Create input buffer fail.", final ); + } + else + { + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final ); + } + } + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + if (i < 2) + { + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + else + { + int32_out_buffer[i] = (int32_t *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( int32_out_buffer[i], "Create output buffer fail.", final ); + memset( int32_out_buffer[i], 0, out_bytes[i] ); + } + } + +#define VSI_NN_KERNEL_READ_SCALAR(type, idx, pointer) \ + vsi_nn_kernel_scalar_read_##type((vsi_nn_kernel_scalar_t)param[idx], pointer) + + status = VSI_NN_KERNEL_READ_SCALAR(float32, SCORE_THRESHOLD, &score_threshold); + status |= VSI_NN_KERNEL_READ_SCALAR(int32, MAX_NUM_DETECTIONS, 
&max_num_detections); + status |= VSI_NN_KERNEL_READ_SCALAR(int32, NMS_KERNEL_METHOD, &nms_kernel_method); + status |= VSI_NN_KERNEL_READ_SCALAR(float32, IOU_THRESHOLD, &iou_threshold); + status |= VSI_NN_KERNEL_READ_SCALAR(float32, SIGMA, &sigma); + status |= VSI_NN_KERNEL_READ_SCALAR(float32, NMS_SCORE_THRESHOLD, &nms_score_threshold); + CHECK_STATUS_FAIL_GOTO(status, final ); +#undef VSI_NN_KERNEL_READ_SCALAR + + numRois = in_attr[0]->shape->data[1]; + numClasses = in_attr[0]->shape->data[0]; + + batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO( batch_data, "Create batch_data fail.", final ); + memset(batch_data, 0, numRois * sizeof(uint32_t)); + + for (i = 0, ind = -1; i < numRois; i++) + { + if (int32_in_buffer[2][i] != ind) + { + ind = int32_in_buffer[2][i]; + numBatch++; + } + batch_data[numBatch - 1]++; + } + select = (uint32_t*)malloc(numBatch * numRois + * numClasses * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO( select, "Create select fail.", final ); + memset(select, 0, numBatch * numRois * numClasses * sizeof(uint32_t)); + for (n = 0; n < (uint32_t)numBatch; n++) + { + int32_t numDetections_batch = 0; + uint32_t select_start_batch = select_size; + uint32_t select_len = 0; + // Exclude class 0 (background) + for (c = 1; c < numClasses; c++) + { + uint32_t select_start = select_size; + int32_t maxNumDetections0 = max_num_detections; + uint32_t numDetections = 0; + for (b = 0; b < batch_data[n]; b++) + { + uint32_t index = b * numClasses + c; + float score = f32_in_buffer[0][scores_index + index]; + if (score > score_threshold) { + select[select_size] = index; + select_size++; + } + } + select_len = select_size - select_start; + + if (maxNumDetections0 < 0) + { + maxNumDetections0 = select_len; + } + + for (j = 0; (j < select_len && numDetections < (uint32_t)maxNumDetections0); j++) + { + // find max score and swap to the front. + int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]), + &(select[select_start + j]), select_len - j) + j; + + swap_element(&(select[select_start]), max_index, j); + + // Calculate IoU of the rest, swap to the end (disgard) if needed. + for (i = j + 1; i < select_len; i++) + { + int32_t roiBase0 = roi_index + select[select_start + i] * kRoiDim; + int32_t roiBase1 = roi_index + select[select_start + j] * kRoiDim; + float iou = getIoUAxisAligned(&(f32_in_buffer[1][roiBase0]), + &(f32_in_buffer[1][roiBase1])); + float kernel_iou; + if (nms_kernel_method == 0) + { + kernel_iou = hard_nms_kernel(iou, iou_threshold); + } + else if (nms_kernel_method == 1) + { + kernel_iou = linear_nms_kernel(iou, iou_threshold); + } + else + { + kernel_iou = gaussian_nms_kernel(iou, sigma); + } + f32_in_buffer[0][scores_index + select[select_start + i]] *= kernel_iou; + if (f32_in_buffer[0][scores_index + select[select_start + i]] < nms_score_threshold) + { + swap_element(&(select[select_start]), i, select_len - 1); + i--; + select_len--; + } + } + numDetections++; + } + select_size = select_start + select_len; + numDetections_batch += numDetections; + } + + // Take top max_num_detections. + sort_element_by_score(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]), + numDetections_batch); + + if (numDetections_batch > max_num_detections && max_num_detections >= 0) + { + select_size = select_start_batch + max_num_detections; + } + select_len = select_size - select_start_batch; + // Sort again by class. 
+ sort_element_by_class(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]), + select_len, numClasses); + + for (i = 0; i < select_len; i++) + { + int32_t in_index0 = scores_index + select[select_start_batch + i]; + int32_t in_index1 = roi_index + select[select_start_batch + i] * kRoiDim; + f32_out_buffer[0][roi_out_index] = f32_in_buffer[0][in_index0]; + memcpy(&(f32_out_buffer[1][roi_out_index * kRoiDim]), + &f32_in_buffer[1][in_index1], kRoiDim * sizeof(float)); + int32_out_buffer[2][roi_out_index] = select[select_start_batch + i] % numClasses; + int32_out_buffer[3][roi_out_index] = n; + roi_out_index++; + } + + scores_index += batch_data[n] * numClasses; + roi_index += batch_data[n] * numClasses * kRoiDim; + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (i < 2) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + } + else + { + status = vsi_nn_kernel_tensor_write( output[i], out_attr[i], + int32_out_buffer[i], out_bytes[i] ); + } + CHECK_STATUS_FAIL_GOTO( status, final ); + } +final: + vsi_nn_safe_free(batch_data); + vsi_nn_safe_free(select); + for (i = 0; i < _INPUT_NUM; i++) + { + vsi_nn_safe_free(f32_in_buffer[i]); + vsi_nn_safe_free(int32_in_buffer[i]); + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + vsi_nn_safe_free(f32_out_buffer[i]); + vsi_nn_safe_free(int32_out_buffer[i]); + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _box_with_nms_limit_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _box_with_nms_limit_kernel_param_def ); + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BOX_WITH_NMS_LIMIT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float score_threshold = vsi_nn_kernel_param_get_float32( params, "score_threshold" ); + int32_t max_num_detections = vsi_nn_kernel_param_get_int32( params, "max_num_detections" ); + int32_t nms_kernel_method = vsi_nn_kernel_param_get_int32( params, "nms_kernel_method" ); + float iou_threshold = vsi_nn_kernel_param_get_float32( params, "iou_threshold" ); + float sigma = vsi_nn_kernel_param_get_float32( params, "sigma" ); + float nms_score_threshold = vsi_nn_kernel_param_get_float32( params, "nms_score_threshold" ); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &score_threshold ); + node_params[MAX_NUM_DETECTIONS] = vsi_nn_kernel_scalar_create( graph, I32, 
&max_num_detections ); + node_params[NMS_KERNEL_METHOD] = vsi_nn_kernel_scalar_create( graph, I32, &nms_kernel_method ); + node_params[IOU_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold ); + node_params[SIGMA] = vsi_nn_kernel_scalar_create( graph, F32, &sigma ); + node_params[NMS_SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &nms_score_threshold ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCORE_THRESHOLD] ); + vsi_nn_kernel_scalar_release( &node_params[MAX_NUM_DETECTIONS] ); + vsi_nn_kernel_scalar_release( &node_params[NMS_KERNEL_METHOD] ); + vsi_nn_kernel_scalar_release( &node_params[IOU_THRESHOLD] ); + vsi_nn_kernel_scalar_release( &node_params[SIGMA] ); + vsi_nn_kernel_scalar_release( &node_params[NMS_SCORE_THRESHOLD] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( box_with_nms_limit, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index 64f9490..a00cfcb 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -47,6 +47,8 @@ typedef enum UNARY_HSIGMOID, UNARY_MISH, UNARY_ROUND, + UNARY_GELU, + UNARY_HGELU, } unary_type_e; @@ -109,6 +111,58 @@ static float round_eval(float data) return data; } +static float erf_eval(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; /*n!*/ + float x_pow = x; + int32_t one = 1; + int32_t n = 1; + + if (x <= -3) + { + return -1; + } + else if (x >= 3) + { + return 1; + } + + while (vsi_abs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n ++; + } +#define VSI_MUL2_RSQRTPI (1.1283791670955126f) + + res *= VSI_MUL2_RSQRTPI; + + return res; +} + +static float gelu_eval(float data) +{ + data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f)))); + + return data; +} + +#define VSI_SQRT_2_RCP_PI 0.7978845834732056f +static float hgelu_eval(float data) +{ + float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI * + (data + 0.044715f * data * data * data))))); + + return data * cdf; +} + DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) ( vsi_nn_kernel_node_t node, @@ -176,6 +230,12 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) case UNARY_ROUND: data = round_eval(data); break; + case UNARY_GELU: + data = gelu_eval(data); + break; + case UNARY_HGELU: + data = hgelu_eval(data); + break; default: break; } @@ -309,4 +369,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND ) \ No newline at end of file +REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU ) \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c index 07f8e82..b3d1562 100644 --- a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c @@ -101,11 +101,11 @@ DEF_KERNEL_EXECUTOR(_compute) CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); memset( 
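For reference, the two new eltwise-unary modes registered above follow the standard GELU definitions: `gelu_eval` uses the exact form with erf evaluated via its Maclaurin series (as in `erf_eval`), while `hgelu_eval` uses the common tanh approximation. The constants in the code correspond to 2/√π and √(2/π).

```latex
\mathrm{GELU}(x) = \tfrac{1}{2}\,x\left(1 + \operatorname{erf}\!\left(\tfrac{x}{\sqrt{2}}\right)\right),
\qquad
\operatorname{erf}(x) = \frac{2}{\sqrt{\pi}} \sum_{n=0}^{\infty} \frac{(-1)^{n}\,x^{2n+1}}{n!\,(2n+1)}
```

```latex
\mathrm{GELU}_{\tanh}(x) \approx \tfrac{1}{2}\,x\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right)
```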
f32_out_buffer[i], 0, out_bytes[i] ); } -#define ERF_PI 3.141592653589793 +#define VSI_ERF_PI 3.141592653589793 for (i = 0; i < out_elements[0]; i ++) { /* 2 / sqrt(pi) * (sum[(-1)^n! * x ^ (2n + 1)] + x) */ - float x = f32_in_buffer[0][i]; + float x = vsi_clamp(f32_in_buffer[0][i], -2, 2); float res = 0; float tmp = x; float factorial = 1; /*n!*/ @@ -126,7 +126,7 @@ DEF_KERNEL_EXECUTOR(_compute) } - res *= 2.0f / (float)sqrt(ERF_PI); + res *= 2.0f / (float)sqrt(VSI_ERF_PI); f32_out_buffer[0][i] = res; } diff --git a/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c b/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c new file mode 100644 index 0000000..978010c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c @@ -0,0 +1,188 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
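On the erf_cpu.c hunk above: clamping the series argument to [-2, 2] presumably keeps the truncated alternating Maclaurin series fast and numerically well-behaved for large |x| (this reading of the intent is an assumption). The accuracy cost is small because erf saturates quickly:

```latex
\operatorname{erf}(2) \approx 0.99532
\;\;\Rightarrow\;\;
\bigl|\operatorname{erf}(x) - \operatorname{erf}(\mathrm{clamp}(x,-2,2))\bigr| \le 1 - \operatorname{erf}(2) \approx 4.7\times 10^{-3}
\quad \text{for all } x.
```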
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.extra_ending") + + +/* + * Kernel params + */ +static vx_param_description_t _extra_ending_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _EXTRA_ENDING_PARAM_NUM _cnt_of_array( _extra_ending_kernel_param_def ) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + uint8_t *u8_in_buffer[_INPUT_NUM] = {NULL}; + uint8_t *u8_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t i = 0; + + /* prepare data */ + input[1] = (vsi_nn_kernel_tensor_t)param[1]; + in_attr[1] = vsi_nn_kernel_tensor_attr_create( input[1] ); + u8_in_buffer[1] = (uint8_t*)vsi_nn_kernel_tensor_create_buffer( input[1], in_attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( u8_in_buffer[i], "Create input buffer fail.", final ); + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(uint8_t); + u8_out_buffer[i] = (uint8_t *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( u8_out_buffer[i], "Create output buffer fail.", final ); + memset( u8_out_buffer[i], 0, out_bytes[i] ); + } + + memcpy(u8_out_buffer[0], u8_in_buffer[1], out_bytes[0]); + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + status = vsi_nn_kernel_tensor_write( output[i], out_attr[i], + u8_out_buffer[i], out_bytes[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } +final: + for (i = 0; i < _INPUT_NUM; i++) + { + vsi_nn_safe_free(u8_in_buffer[i]); + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + vsi_nn_safe_free(u8_out_buffer[i]); + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _extra_ending_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _extra_ending_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_EXTRA_ENDING_PARAM_NUM] = {NULL}; + 
vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _EXTRA_ENDING_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _EXTRA_ENDING_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( extra_ending, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c b/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c new file mode 100644 index 0000000..791926b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c @@ -0,0 +1,323 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (2) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.heatmap_max_keypoint") + + +/* + * Kernel params + */ +static vx_param_description_t _heatmap_max_keypoint_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _HEATMAP_MAX_KEYPOINT_PARAM_NUM _cnt_of_array( _heatmap_max_keypoint_kernel_param_def ) + +// This function uses Taylor expansion up to the quatratic term to approximate bicubic +// upscaling result. 
+// 2nd order Taylor expansion: D(x) = D - b'x + 1/2 * x'Ax +// where D = grid[1][1], Taylor expansion center, the original score, +// x = delta, the correction on max keypoint position, +// D(x) = deltaScore, the accuracy score after correction +static void _solve_for_delta + ( + const float grid[3][3], + float* delta, + float* deltaScore, + float fpAtol, + float fpRtol + ) +{ + // b: negative 1st order derivative at center + // A: Hessian matrix at center (2nd order derivative) + float A[2][2], b[2]; + float crossProd1, crossProd2; + float detA; + b[0] = -(grid[1][2] - grid[1][0]) / 2.0f; + b[1] = -(grid[2][1] - grid[0][1]) / 2.0f; + A[0][0] = grid[1][0] - 2.0f * grid[1][1] + grid[1][2]; + A[0][1] = (grid[2][2] - grid[2][0] - grid[0][2] + grid[0][0]) / 4.0f; + A[1][0] = A[0][1]; + A[1][1] = grid[0][1] - 2.0f * grid[1][1] + grid[2][1]; + + // solve Ax=b, where x=delta -> delta = inv(A) * b + crossProd1 = A[0][0] * A[1][1]; + crossProd2 = A[0][1] * A[1][0]; + detA = crossProd1 - crossProd2; + // check if A is invertible + if (fabs(detA) < (fpAtol + fpRtol * crossProd1)) return; + delta[0] = (A[1][1] * b[0] - A[0][1] * b[1]) / detA; + delta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / detA; + + // clip out of range delta, i.e. delta > 3/2 + if (fabs(delta[0]) > 1.5f || fabs(delta[1]) > 1.5f) + { + float scale = (float)(1.5f / vsi_nn_max(fabs(delta[0]), fabs(delta[1]))); + delta[0] *= scale; + delta[1] *= scale; + } + + *deltaScore = grid[1][1] - b[0] * delta[0] - b[1] * delta[1] + + ((A[0][0] * delta[0] + A[0][1] * delta[1]) * delta[0] + + (A[1][0] * delta[0] + A[1][1] * delta[1]) * delta[1]) / + 2.0f; +} +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + uint32_t j = 0; + uint32_t k = 0; + uint32_t numBoxes = 0; + uint32_t heatmapSize = 0; + uint32_t numKeypoints = 0; + uint32_t boxInfoLength = 4; + uint32_t output_score_index = 0; + uint32_t output_keypoint_index = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final ); + } + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + numBoxes = in_attr[0]->shape->data[3]; + heatmapSize = in_attr[0]->shape->data[2]; + numKeypoints = in_attr[0]->shape->data[0]; 
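Spelled out, `_solve_for_delta` above fits a quadratic around the arg-max cell of the local 3x3 grid and moves the keypoint to the fitted peak. With D the center score, b the negated central-difference gradient, and A the finite-difference Hessian (all as built in the code), the stationary point of the expansion gives the correction and the refined score:

```latex
D(\mathbf{x}) \approx D - \mathbf{b}^{\top}\mathbf{x} + \tfrac{1}{2}\,\mathbf{x}^{\top} A\,\mathbf{x},
\qquad
\nabla_{\mathbf{x}} D = -\mathbf{b} + A\mathbf{x} = 0
\;\Rightarrow\;
\boldsymbol{\delta} = A^{-1}\mathbf{b}
= \frac{1}{\det A}\begin{pmatrix} A_{11} & -A_{01} \\ -A_{10} & A_{00} \end{pmatrix}\mathbf{b},
```
```latex
\text{deltaScore} = D - \mathbf{b}^{\top}\boldsymbol{\delta} + \tfrac{1}{2}\,\boldsymbol{\delta}^{\top} A\,\boldsymbol{\delta},
\qquad
x_{\mathrm{kp}} = \frac{w_{\max} + \delta_{0} + 0.5}{S}\,(w_{\mathrm{end}} - w_{\mathrm{start}}) + w_{\mathrm{start}}
```

The last formula (and its y counterpart) is how the loop below converts the refined heatmap cell into ROI coordinates, with S the heatmap size.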
+ + for(i = 0; i < numBoxes; i++) + { + for (j = 0; j < numKeypoints; j++) + { + uint32_t maxIndex = 0; + float maxScore = -FLT_MAX; + uint32_t maxIndexWidth; + uint32_t maxIndexHeight; + float localGrid[3][3] = {{0}}; + int32_t dh, dw; + float delta[2] = {0.0f, 0.0f}, deltaScore; + float wRoiStart = f32_in_buffer[1][i * boxInfoLength]; + float hRoiStart = f32_in_buffer[1][i * boxInfoLength + 1]; + float wRoiEnd = f32_in_buffer[1][i * boxInfoLength + 2]; + float hRoiEnd = f32_in_buffer[1][i * boxInfoLength + 3]; + float roiWidth = wRoiEnd - wRoiStart; + float roiHeight = hRoiEnd - hRoiStart; + float wRelativePos; + float hRelativePos; + for (k = 0; k < heatmapSize * heatmapSize; k++) + { + uint32_t index = i * heatmapSize * heatmapSize * numKeypoints + + k * numKeypoints + j; + float val = f32_in_buffer[0][index]; + if (maxScore < val) + { + maxScore = val; + maxIndex = k; + } + } + maxIndexWidth = maxIndex % heatmapSize; + maxIndexHeight = maxIndex / heatmapSize; + + // get local 3x3 grid + for (dh = -1; dh <= 1; dh++) + { + for (dw = -1; dw <= 1; dw++) + { + // cast uint32_t to int32_t + int32_t h = (int32_t)(maxIndexHeight) + dh; + int32_t w = (int32_t)(maxIndexWidth) + dw; + uint32_t heatmapIndex; + + // use mirroring for out of bound indexing + // need to ensure heatmapSize >= 2 + h = h < 0 ? 1 : (h >= (int32_t)heatmapSize ? heatmapSize - 2 : h); + w = w < 0 ? 1 : (w >= (int32_t)heatmapSize ? heatmapSize - 2 : w); + + heatmapIndex = i * heatmapSize * heatmapSize * numKeypoints + + (uint32_t)(h) * heatmapSize * numKeypoints + + (uint32_t)(w) * numKeypoints + j; + localGrid[dh + 1][dw + 1] = f32_in_buffer[0][heatmapIndex]; + } + } + deltaScore = maxScore; + _solve_for_delta((const float (*)[3])localGrid, delta, &deltaScore, 1e-3f, 1e-3f); + + wRelativePos = ((float)(maxIndexWidth) + delta[0] + 0.5f) / + (float)(heatmapSize); + hRelativePos = ((float)(maxIndexHeight) + delta[1] + 0.5f) / + (float)(heatmapSize); + f32_out_buffer[0][output_score_index] = deltaScore; + f32_out_buffer[1][output_keypoint_index] = wRelativePos * roiWidth + wRoiStart; + f32_out_buffer[1][output_keypoint_index + 1] = hRelativePos * roiHeight + hRoiStart; + output_score_index++; + output_keypoint_index += 2; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } +final: + for (i = 0; i < _INPUT_NUM; i++) + { + vsi_nn_safe_free(f32_in_buffer[i]); + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + vsi_nn_safe_free(f32_out_buffer[i]); + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _heatmap_max_keypoint_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _heatmap_max_keypoint_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const 
vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_HEATMAP_MAX_KEYPOINT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _HEATMAP_MAX_KEYPOINT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _HEATMAP_MAX_KEYPOINT_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( heatmap_max_keypoint, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c new file mode 100644 index 0000000..d5e7c39 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c @@ -0,0 +1,285 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _CPU_ARG_NUM (3) +#define _CPU_INPUT_NUM (3) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.scatter_nd_update") + +DEF_KERNEL_EXECUTOR(_scatter_nd_update_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + uint32_t * para_buffer[1] = { NULL }; + uint32_t * mask = NULL; + float * buffer[3] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL }; + int32_t i = 0, j = 0; + int32_t block_size = 1, indices_num = 1; + int32_t coord_dim = 1; + int32_t mask_len = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // ref + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // idx int + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // update + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; // output + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + para_buffer[0] = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( para_buffer[0], "Create input1 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input2 buffer fail.", final ); + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(block_size)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &(coord_dim)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &(indices_num)); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memcpy( buffer[2], buffer[0], out_elements * sizeof(float) ); + + mask_len = (int32_t)out_elements / block_size; + mask = (uint32_t *)malloc( mask_len * sizeof(uint32_t) ); + memset(mask, 0, mask_len * sizeof(uint32_t)); + + if (coord_dim <= 5) + { + int32_t stride[5] = {0, 0, 0, 0, 0}; + int32_t new_shape[5] = {1, 1, 1, 1, 1}; + int32_t merge_dim = (int32_t)attr[3]->shape->size - coord_dim + 1; + + for(i = 0; i < merge_dim; ++i) + { + new_shape[0] *= attr[3]->shape->data[i]; + } + stride[0] = new_shape[0] / block_size; + + for(i = 1; i < coord_dim; ++i) + { + new_shape[i] = attr[3]->shape->data[merge_dim + i - 1]; + + stride[i] = stride[i - 1] * new_shape[i]; + } + + for(i = 0; i < indices_num; i++) + { + uint32_t in_index = i * block_size; + uint32_t out_index = 0; + uint32_t coord[5] = {0}; + int32_t byd_flg = 0; + int32_t mask_idx = 0; + + for(j = 0; j < coord_dim; j++) + { + coord[j] = para_buffer[0][i * coord_dim + coord_dim - j - 1]; + if (coord[j] >= (uint32_t)new_shape[j]) + { + byd_flg 
= 1; + break; + } + } + if (byd_flg) + { + continue; + } + + mask_idx = coord[4] * stride[3] + coord[3] * stride[2] + + coord[2] * stride[1] + coord[1] * stride[0] + coord[0]; + out_index = mask_idx * block_size; + if (mask[mask_idx] == 0) + { + memset(buffer[2] + out_index, 0, block_size * sizeof(float)); + mask[mask_idx] = 1; + } + for(j = 0; j < block_size; j++) + { + buffer[2][out_index + j] += buffer[1][in_index + j]; + } + } + } + else + { + status = VSI_FAILURE; + CHECK_STATUS_FAIL_GOTO( status, final ); + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if ( para_buffer[0] ) + { + free( para_buffer[0] ); + } + + if (mask) + { + free(mask); + } + for( i = 0; i < 3; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < 4; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _scatter_nd_update_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _scatter_nd_update_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _scatter_nd_update_exec, + _scatter_nd_update_kernel_param_def, + _cnt_of_array( _scatter_nd_update_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 4; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num ); + /* Pass parameters to node. 
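The loop above flattens each (reversed, VX-ordered) index tuple into a block offset via the precomputed strides, zeroes the addressed block on first touch, and then accumulates, so duplicate indices sum their updates rather than last-write-wins. For orientation, a minimal row-major sketch of the plain ScatterNdUpdate semantics is below; `scatter_nd_update_ref` and its flat-offset argument are hypothetical simplifications, not part of the patch.

```c
#include <stddef.h>
#include <string.h>

/* Illustration only: copy ref, then overwrite one block_size-sized block per
 * index tuple. flat_offsets[i] is the already-flattened block index addressed
 * by indices[i]; the CPU kernel above derives it from strides over the output
 * shape instead. */
static void scatter_nd_update_ref(float *out, const float *ref, size_t ref_len,
                                  const size_t *flat_offsets, const float *updates,
                                  size_t num_indices, size_t block_size)
{
    size_t i, j;
    memcpy(out, ref, ref_len * sizeof(float));
    for (i = 0; i < num_indices; i++)
    {
        for (j = 0; j < block_size; j++)
        {
            out[flat_offsets[i] * block_size + j] = updates[i * block_size + j];
        }
    }
}
```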
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + vsi_nn_kernel_scalar_release( &backend_params[6] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( scatter_nd_update, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c b/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c new file mode 100644 index 0000000..a13aee1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c @@ -0,0 +1,289 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.signal_frame") + +/* + * Kernel params + */ +static vx_param_description_t _signal_frame_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def ) +#define FRAME_LENGHT (2) +#define FRAME_STEP (3) +#define AXIS (4) +#define PAD_END (5) +#define PAD_VAL (6) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t i = 0; + int32_t j = 0; + int32_t k = 0; + int32_t frame_length = 0; + int32_t frame_step = 0; + int32_t axis = 0; + int32_t pad_end = 0; + int32_t length_samples = 0; + int32_t num_frames = 0; + int32_t inner_dim = 1; + int32_t outer_dim = 1; + int32_t inner_size = 1; + float pad_val = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final ); + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[FRAME_LENGHT], &frame_length); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[FRAME_STEP], &frame_step); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[AXIS], &axis); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[PAD_END], &pad_end); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[PAD_VAL], &pad_val); + CHECK_STATUS_FAIL_GOTO( status, final ); + + for (i = 0; i < axis; i++) + { + inner_dim *= in_attr[0]->shape->data[i]; + } + length_samples = in_attr[0]->shape->data[axis]; + for (i = axis + 1; i < (int32_t)in_attr[0]->shape->size; i++) + { + outer_dim *= in_attr[0]->shape->data[i]; + } + + for (i = 0; i < axis + 1; i++) + { + 
inner_size *= out_attr[0]->shape->data[i]; + } + + num_frames = (length_samples + frame_step - 1) / frame_step; + num_frames = pad_end ? num_frames : (length_samples - frame_length) / frame_step + 1; + + for (i = 0; i < outer_dim; i++) + { + float * src_ptr = f32_in_buffer[0] + i * length_samples * inner_dim; + float * dst_ptr = f32_out_buffer[0] + i * num_frames * frame_length * inner_dim; + + for (j = 0; j < num_frames; j++) + { + for (k = 0; k < frame_length; k++) + { + int32_t m = j * frame_step + k; + + if (pad_end) + { + if (m >= length_samples) + { + int32_t l = 0; + for (l = 0; l < inner_dim; l++) + { + (dst_ptr + (j * frame_length + k) * inner_dim)[l] = pad_val; + } + } + else + { + memcpy(dst_ptr + (j * frame_length + k) * inner_dim, src_ptr + m * inner_dim, + inner_dim * sizeof(float)); + } + } + else + { + memcpy(dst_ptr + (j * frame_length + k) * inner_dim, src_ptr + m * inner_dim, + inner_dim * sizeof(float)); + } + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _signal_frame_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" ); + int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + float pad_val = vsi_nn_kernel_param_get_float32( params, "pad_val" ); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
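The frame count used by the loop above follows directly from the scalar parameters; with L = length_samples, l = frame_length, and s = frame_step:

```latex
\text{num\_frames} =
\begin{cases}
\left\lceil L / s \right\rceil, & \text{pad\_end} = 1,\\[0.5ex]
\left\lfloor (L - \ell) / s \right\rfloor + 1, & \text{pad\_end} = 0.
\end{cases}
```

For example, L = 10, l = 4, s = 3 yields 4 frames with pad_end (the last frame is filled with pad_val where it runs past the signal) and 3 fully populated frames without it.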
*/ + node_params[FRAME_LENGHT] = vsi_nn_kernel_scalar_create( graph, I32, &frame_length ); + node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create( graph, I32, &frame_step ); + node_params[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + node_params[PAD_END] = vsi_nn_kernel_scalar_create( graph, I32, &pad_end ); + node_params[PAD_VAL] = vsi_nn_kernel_scalar_create( graph, F32, &pad_val ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[FRAME_LENGHT] ); + vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] ); + vsi_nn_kernel_scalar_release( &node_params[AXIS] ); + vsi_nn_kernel_scalar_release( &node_params[PAD_END] ); + vsi_nn_kernel_scalar_release( &node_params[PAD_VAL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( signal_frame, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c b/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c new file mode 100644 index 0000000..4dd9d59 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c @@ -0,0 +1,389 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.spatial_transformer") + + +/* + * Kernel params + */ +static vx_param_description_t _spatial_transformer_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SPATIAL_TRANSFORMER_PARAM_NUM _cnt_of_array( _spatial_transformer_kernel_param_def ) +#define HAS_THETA_1_1 (3) +#define HAS_THETA_1_2 (4) +#define HAS_THETA_1_3 (5) +#define HAS_THETA_2_1 (6) +#define HAS_THETA_2_2 (7) +#define HAS_THETA_2_3 (8) +#define THETA_1_1 (9) +#define THETA_1_2 (10) +#define THETA_1_3 (11) +#define THETA_2_1 (12) +#define THETA_2_2 (13) +#define THETA_2_3 (14) +#define ALIGN_CORNERS (15) + +static void _transform_affine(int32_t dst_x, int32_t dst_y, const float m[], float *src_x, float *src_y) +{ + *src_x = dst_x * m[0] + dst_y * m[2] + m[4]; + *src_y = dst_x * m[1] + dst_y * m[3] + m[5]; +} + +static float _read_pixel(float *base, vsi_nn_kernel_tensor_attr_t *attr, + float x, float y, int32_t z, int32_t b) +{ + vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= attr->shape->data[0] || y >= attr->shape->data[1]); + int32_t bx, by; + int32_t offset = (b * attr->shape->data[2] + z) * attr->shape->data[0] * attr->shape->data[1]; + float pixel = 0; + + if (out_of_bounds) + { + return 0; + } + // bounded x/y + bx = (int32_t)x; + by = (int32_t)y; + + pixel = base[attr->shape->data[0] * by + bx + offset]; + + return pixel; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t i = 0; + int32_t b = 0; + int32_t c = 0; + int32_t j = 0; + int32_t x = 0; + int32_t y = 0; + int32_t has_theta[6] = {0}; + int32_t batch = 1; + int32_t depth = 1; + int32_t height = 1; + int32_t width = 1; + int32_t input_height = 1; + int32_t input_width = 1; + int32_t rank = 0; + int32_t index = 0; + int32_t align_corners = 0; + float theta[6] = {0}; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( 
input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final ); + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_1], &has_theta[0]); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_2], &has_theta[1]); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_3], &has_theta[2]); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_1], &has_theta[3]); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_2], &has_theta[4]); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_3], &has_theta[5]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_1], &theta[0]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_2], &theta[1]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_3], &theta[2]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_1], &theta[3]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_2], &theta[4]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_3], &theta[5]); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[ALIGN_CORNERS], &align_corners); + CHECK_STATUS_FAIL_GOTO( status, final ); + + rank = (int32_t)out_attr[0]->shape->size; + width = out_attr[0]->shape->data[0]; + height = out_attr[0]->shape->data[1]; + depth = rank > 2 ? out_attr[0]->shape->data[2] : 1; + batch = rank > 3 ? out_attr[0]->shape->data[3] : 1; + + input_width = in_attr[0]->shape->data[0]; + input_height = in_attr[0]->shape->data[1]; + + for (b = 0; b < batch; b++) + { + float _w = (float)input_width; + float _h = (float)input_height; + float w = (float)width; + float h = (float)height; + float matrix_m[6] = {0}; + j = 0; + for (i = 0; i < 6; i++) + { + if (has_theta[i] == 0) + { + theta[i] = f32_in_buffer[1][b * in_attr[1]->shape->data[0] + j]; + j ++; + } + } + + if (align_corners && w > 1) + { + w = w - 1; + } + + if (align_corners && h > 1) + { + h = h - 1; + } + + matrix_m[0] = theta[4] * _w / w; + matrix_m[2] = theta[3] * _w / h; + matrix_m[4] = (theta[5] - theta[4] - theta[3] + 1) * _w * 0.5f; + matrix_m[1] = theta[1] * _h / w; + matrix_m[3] = theta[0] * _h / h; + matrix_m[5] = (theta[2] - theta[1] - theta[0] + 1) * _h * 0.5f; + for (c = 0; c < depth; c++) + { + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) + { + float xf = 0; + float yf = 0; + float tl = 0, tr = 0, bl = 0, br = 0; + float ar = 0, ab = 0, al = 0, at = 0; + + _transform_affine(x, y, matrix_m, &xf, &yf); + + xf = xf < 0 ? xf - 1 : xf; + yf = yf < 0 ? 
yf - 1 : yf; + ar = xf - floorf(xf); + ab = yf - floorf(yf); + al = 1.0f - ar; + at = 1.0f - ab; + + tl = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf), floorf(yf), c, b); + tr = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf) + 1, floorf(yf), c, b); + bl = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf), floorf(yf) + 1, c, b); + br = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf) + 1, floorf(yf) + 1, c, b); + + f32_out_buffer[0][index ++] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + } + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _spatial_transformer_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _spatial_transformer_kernel_param_def ); + + return VSI_SUCCESS; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SPATIAL_TRANSFORMER_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t has_theta_1_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_1" ); + int32_t has_theta_1_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_2" ); + int32_t has_theta_1_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_3" ); + int32_t has_theta_2_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_1" ); + int32_t has_theta_2_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_2" ); + int32_t has_theta_2_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_3" ); + float theta_1_1 = vsi_nn_kernel_param_get_float32( params, "theta_1_1" ); + float theta_1_2 = vsi_nn_kernel_param_get_float32( params, "theta_1_2" ); + float theta_1_3 = vsi_nn_kernel_param_get_float32( params, "theta_1_3" ); + float theta_2_1 = vsi_nn_kernel_param_get_float32( params, "theta_2_1" ); + float theta_2_2 = vsi_nn_kernel_param_get_float32( params, "theta_2_2" ); + float theta_2_3 = vsi_nn_kernel_param_get_float32( params, "theta_2_3" ); + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SPATIAL_TRANSFORMER_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( 
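The sampling above is an inverse affine warp with bilinear interpolation: each output pixel is mapped back into the input by `_transform_affine` and blended from its four neighbours, with out-of-bounds reads returning 0 (ignoring the negative-coordinate adjustment in the code). In formula form, matching the m[] layout and the tl/tr/bl/br weights:

```latex
\begin{pmatrix} x_s \\ y_s \end{pmatrix}
=
\begin{pmatrix} m_0 & m_2 \\ m_1 & m_3 \end{pmatrix}
\begin{pmatrix} x_d \\ y_d \end{pmatrix}
+
\begin{pmatrix} m_4 \\ m_5 \end{pmatrix},
\qquad
a_r = x_s - \lfloor x_s \rfloor,\; a_b = y_s - \lfloor y_s \rfloor,
```
```latex
o(x_d, y_d) = (1-a_r)(1-a_b)\,p_{tl} + a_r(1-a_b)\,p_{tr} + (1-a_r)\,a_b\,p_{bl} + a_r a_b\,p_{br}.
```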
graph, I32, &has_theta_1_1 ); + node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 ); + node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 ); + node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 ); + node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 ); + node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 ); + node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 ); + node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 ); + node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 ); + node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 ); + node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 ); + node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 ); + node_params[ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SPATIAL_TRANSFORMER_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] ); + vsi_nn_kernel_scalar_release( &node_params[ALIGN_CORNERS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( spatial_transformer, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c b/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c new file mode 100644 index 0000000..7c71221 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c @@ -0,0 +1,185 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.sync_host") + + +/* + * Kernel params + */ +static vx_param_description_t _sync_host_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SYNC_HOST_PARAM_NUM _cnt_of_array( _sync_host_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + void *in_buffer[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + in_buffer[i] = vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], FALSE ); + CHECK_PTR_FAIL_GOTO( in_buffer[i], "Create input buffer fail.", final ); + } + + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + out_bytes[i] = vsi_nn_kernel_tensor_attr_get_bytes( out_attr[i] ); + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write( output[i], out_attr[i], + in_buffer[i], out_bytes[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (in_buffer[i]) + { + free(in_buffer[i]); + in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _sync_host_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _sync_host_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SYNC_HOST_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == 
status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SYNC_HOST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SYNC_HOST_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( sync_host, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c new file mode 100644 index 0000000..7ffc7d8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c @@ -0,0 +1,219 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
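+ * The CPU reference below takes the destination row index from input[1] and
+ * writes each channel of input[0] into that row of the output tensor.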
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.tensorstackconcat") + + +/* + * Kernel params + */ +static vx_param_description_t _tensorstackconcat_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _TENSORSTACKCONCAT_PARAM_NUM _cnt_of_array( _tensorstackconcat_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_elements[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + uint32_t depth = 0; + uint32_t height = 1; + uint32_t width = 0; + uint32_t index = 0; + uint32_t c = 0, y = 0, x = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + f32_out_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( output[i], out_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + } + + depth = in_attr[0]->shape->data[2]; + height = in_attr[0]->shape->data[1]; + width = in_attr[0]->shape->data[0]; + index = (int32_t)f32_in_buffer[1][0]; + + for (c = 0; c < depth; c++) + { + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) + { + int32_t i_idx = c * width * height + y * width + x; + int32_t o_idx = (c * out_attr[0]->shape->data[1] + index ) * out_attr[0]->shape->data[0] + x; + float value = f32_in_buffer[0][i_idx]; + + f32_out_buffer[0][o_idx] = value; + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = 
_tensorstackconcat_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _tensorstackconcat_kernel_param_def ); + + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_TENSORSTACKCONCAT_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TENSORSTACKCONCAT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _TENSORSTACKCONCAT_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( tensorstackconcat, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c index 7dd0a16..a4b8801 100644 --- a/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c @@ -79,8 +79,8 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; float *f32_in_buffer[_INPUT_NUM] = {NULL}; float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; size_t out_elements[_OUTPUT_NUM] = {0}; size_t out_bytes[_OUTPUT_NUM] = {0}; diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c index 6cba0a0..10143c6 100644 --- a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c @@ -114,7 +114,7 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_nn_kernel_tensor_attr_t * input1_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input1_attr = NULL; vsi_int_array_t * in_shape = NULL; float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); float scaleIn0 = 1.0f; @@ -224,6 +224,7 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) final: #define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } SAFE_FREE_TENSOR_ATTR(input_attr); + SAFE_FREE_TENSOR_ATTR(input1_attr); return status; } /* _detect_post_box_initializer() */ diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index fd07a58..1294344 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -49,6 +49,8 @@ typedef enum UNARY_HSIGMOID, UNARY_MISH, UNARY_ROUND, + UNARY_GELU, + UNARY_HGELU, } unary_type_e; /* @@ -84,6 +86,8 @@ typedef enum #define HSIGMOID_OPERATION hard_sigmoid #define MISH_OPERATION mish #define ROUND_OPERATION round +#define GELU_OPERATION gelu +#define HGELU_OPERATION hard_gelu static const struct { uint32_t key; @@ -274,6 +278,42 @@ 
static const struct { TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_2D) + + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, BF16, BF16 , KERNEL_SOURCE_3D) + + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16 , KERNEL_SOURCE_2D) + + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16 , KERNEL_SOURCE_2D) }; #undef SIN_OPERATION @@ -284,6 +324,8 @@ static const struct { #undef HSIGMOID_OPERATION #undef MISH_OPERATION #undef ROUND_OPERATION +#undef GELU_OPERATION +#undef HGELU_OPERATION /* * Kernel params @@ -403,6 +445,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ): { gpu_dp_inst_t 
uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -682,6 +726,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND ) - +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c new file mode 100644 index 0000000..e241d94 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c @@ -0,0 +1,243 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + + +// Add kernel hashtable here +#define EXTRA_ENDING_HASH_KEY( OUT_DTYPE ) \ + ( ( OUT_DTYPE ) ) +#define EXTRA_ENDING_KERNEL_MAP( OUT_DTYPE ) \ + { EXTRA_ENDING_HASH_KEY( OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.extra_ending_"#OUT_DTYPE), \ + "extra_ending" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _extra_ending_kernel_map[] = +{ + // Register kernel here + EXTRA_ENDING_KERNEL_MAP( F16 ), + EXTRA_ENDING_KERNEL_MAP( I16 ), + EXTRA_ENDING_KERNEL_MAP( U8 ), + EXTRA_ENDING_KERNEL_MAP( I8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _extra_ending_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EXTRA_ENDING_PARAM_NUM _cnt_of_array( _extra_ending_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_extra_ending_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr = NULL; + vsi_int_array_t * out_shape = NULL; + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + out_shape = attr->shape; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = out_shape->data[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (attr) + { + vsi_nn_kernel_tensor_attr_release( &attr ); + attr = NULL; + } + + return status; +} /* _extra_ending_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e out_dtype; + uint32_t key = 0; + uint32_t i = 0; + + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = EXTRA_ENDING_HASH_KEY( out_dtype ); + + for ( i = 0; i < _cnt_of_array(_extra_ending_kernel_map); i ++ ) + { + if ( _extra_ending_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_extra_ending_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _extra_ending_kernel_map[i].function_name ); + kernel->info.parameters = _extra_ending_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _extra_ending_kernel_param_def ); + kernel->info.initialize = _extra_ending_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _extra_ending_kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _extra_ending_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_EXTRA_ENDING_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + uint32_t rank[3] = {0}; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t i = 0; + + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], &rank[0]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + shapes[1], &rank[1]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[2], &rank[2]); + + for (i = 0; i < 2; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], (uint32_t*)shapes[i], rank[i] ); + } + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[2], rank[2] ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + inputs[0]->attr.dim_num ) ) + { + goto final; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + vx_border_t border; + + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = 0; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _EXTRA_ENDING_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[2], output_num ); + /* Pass parameters to node. 
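node_params was packed above as the two reshaped inputs followed by the reshaped output, with a constant-zero border covering any out-of-range reads.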
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _EXTRA_ENDING_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + +final: + for (i = 0; i < 3; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( extra_ending, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c index 03262c3..f94f94b 100644 --- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -991,8 +991,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) int32_t _is_ln = 0; int32_t _is_cifg = 0; int32_t _is_hybrid = 0; - vsi_nn_kernel_tensor_attr_t* input_attr[9]; - vsi_nn_kernel_tensor_attr_t* attr[2]; + vsi_nn_kernel_tensor_attr_t* input_attr[9] = {NULL}; + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL};; status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln ); CHECK_STATUS_FAIL_GOTO(status, final ); diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index 4416328..01998f3 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -44,6 +44,8 @@ __BEGIN_DECLS #define KERNEL_SOURCE_3 "moments_axis2" #define KERNEL_SOURCE_4 "moments_axis01" #define KERNEL_SOURCE_5 "moments_axis012" +#define KERNEL_SOURCE_6 "moments_u8" +#define KERNEL_SOURCE_7 "moments_u8_axis012" // Add kernel hashtable here #define HASH_MOMENTS_KEY(_input0_type, _output_type, _axis_num, _axis0, _axis1, _axis2, _image_2d) \ @@ -107,14 +109,19 @@ static const struct { TENSOR_MOMENTS_KERNELS(I8, F16, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(I16, F16, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(U8, U8, 0, KERNEL_SOURCE_6) + TENSOR_MOMENTS_KERNELS(U8, U8, 1, KERNEL_SOURCE_6) + TENSOR_MOMENTS_KERNELS(U8, U8, 2, KERNEL_SOURCE_6) TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(I8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(I16, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, U8, 0, 1, KERNEL_SOURCE_6) TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I8, F16, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I16, F16, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, U8, 0, 1, 2, KERNEL_SOURCE_7) TENSOR_MOMENTS_KERNELS_2D(U8, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(I8, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(I16, F16, 0, KERNEL_SOURCE_1) @@ -123,10 +130,13 @@ static const struct { TENSOR_MOMENTS_KERNELS_2D(I8, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(I16, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(F16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS_2D(U8, U8, 0, KERNEL_SOURCE_6) + TENSOR_MOMENTS_KERNELS_2D(U8, U8, 1, KERNEL_SOURCE_6) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I16, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(F16, F16, 0, 1, KERNEL_SOURCE_4) + 
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, U8, 0, 1, KERNEL_SOURCE_6) }; /* @@ -179,31 +189,41 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL}; vsi_int_array_t * input_shape = NULL; - float scaleIn = 0; - int32_t input_zp = 0; - vx_uint32 iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - float tmpZp2 = 0; - float e2InScale = 0; - float rowSumScale = 0; - int32_t axis = 0; - int32_t axis_num = 0; - int32_t width = 0; - int32_t height = 0; - int32_t chn = 0; - float dimRatio = 1.0; - int32_t iterSize = 16; - float zpScaleSqr_i16 = 0.0f; - float zpScale2_i16 = 0.0f; - float sumScale_i16 = 0.0f; + float scaleIn = 0; + int32_t input_zp = 0; + vx_uint32 iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + float tmpZp2 = 0; + float e2InScale = 0; + float rowSumScale = 0; + int32_t axis = 0; + int32_t axis_num = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + float dimRatio = 1.0; + int32_t iterSize = 16; + float zpScaleSqr_i16 = 0.0f; + float zpScale2_i16 = 0.0f; + float sumScale_i16 = 0.0f; + float output_ZP[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + float outputScale[4] = {1.0f, 1.0f, 1.0f, 1.0f}; + float output_ZP0 = 0.0f; + float outputScale0 = 1; + float output_ZP1 = 0.0f; + float outputScale1 = 1.0f; uint32_t pack_key = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -212,10 +232,13 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[0]->dfp.fl > 0) { @@ -234,6 +257,57 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) scaleIn = 1; } + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_ZP0 = (float)attr[1]->asymm.zero_point; + outputScale0 = 1.0f / attr[1]->asymm.scale; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[1]->dfp.fl > 0) + { + outputScale0 = (float)((int64_t)1 << attr[1]->dfp.fl); + } + else + { + outputScale0 = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); + } + output_ZP0 = 0.0f; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + outputScale0 = 1.0f; + output_ZP0 = 0.0f; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_ZP1 = (float)attr[2]->asymm.zero_point; + outputScale1 = 1.0f / attr[2]->asymm.scale; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + outputScale1 = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + outputScale1 = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + 
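/* dynamic fixed point encodes only a power-of-two scale, so the zero point stays 0 */ +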
output_ZP1 = 0.0f; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + outputScale1 = 1.0f; + output_ZP1 = 0.0f; + } + + output_ZP[0] = output_ZP0; + output_ZP[1] = output_ZP1; + outputScale[0] = outputScale0; + outputScale[1] = outputScale1; + if(attr[0]->dtype == I16) { iterSize = 8; @@ -316,10 +390,10 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) zpScale2_i16 = tmpZp1 * e2InScale; sumScale_i16 = sumInZp * scaleIn; -#define _PACK_SELECT_KEY( IN0_TYPE, AXIS_NUM, FIRST_AXIS ) \ - (IN0_TYPE | (AXIS_NUM << 8) | (FIRST_AXIS << 16)) +#define _PACK_SELECT_KEY( IN0_TYPE, OUT0_TYPE, AXIS_NUM, FIRST_AXIS ) \ + (IN0_TYPE | (OUT0_TYPE << 8) | (AXIS_NUM << 16) | (FIRST_AXIS << 24)) - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, axis_num, axis); + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, axis_num, axis); { gpu_dp_inst_t uniSumU8_16x1 = {{ @@ -377,11 +451,22 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + switch( pack_key ) { - case _PACK_SELECT_KEY( U8, 1, 0): - case _PACK_SELECT_KEY( I8, 1, 0): - case _PACK_SELECT_KEY( I16, 1, 0): + case _PACK_SELECT_KEY( U8, F16, 1, 0): + case _PACK_SELECT_KEY( I8, F16, 1, 0): + case _PACK_SELECT_KEY( I16, F16, 1, 0): { status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); @@ -395,22 +480,28 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16); status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16); status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( F16, 1, 0): + case _PACK_SELECT_KEY( F16, F16, 1, 0): { status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, 1, 1): - case _PACK_SELECT_KEY( I8, 1, 1): - case _PACK_SELECT_KEY( I16, 1, 1): + case _PACK_SELECT_KEY( U8, F16, 1, 1): + case _PACK_SELECT_KEY( I8, F16, 1, 1): + case _PACK_SELECT_KEY( I16, F16, 1, 1): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); @@ -418,19 +509,23 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( F16, 1, 1): + case _PACK_SELECT_KEY( 
F16, F16, 1, 1): { status = vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, 1, 2): - case _PACK_SELECT_KEY( I8, 1, 2): - case _PACK_SELECT_KEY( I16, 1, 2): + case _PACK_SELECT_KEY( U8, F16, 1, 2): + case _PACK_SELECT_KEY( I8, F16, 1, 2): + case _PACK_SELECT_KEY( I16, F16, 1, 2): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); @@ -438,16 +533,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( F16, 1, 2): + case _PACK_SELECT_KEY( F16, F16, 1, 2): { status = vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, 2, 0): - case _PACK_SELECT_KEY( I8, 2, 0): - case _PACK_SELECT_KEY( I16, 2, 0): + case _PACK_SELECT_KEY( U8, F16, 2, 0): + case _PACK_SELECT_KEY( I8, F16, 2, 0): + case _PACK_SELECT_KEY( I16, F16, 2, 0): { status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); @@ -462,12 +559,14 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16); status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16); status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, 3, 0): - case _PACK_SELECT_KEY( I8, 3, 0): - case _PACK_SELECT_KEY( I16, 3, 0): + case _PACK_SELECT_KEY( U8, F16, 3, 0): + case _PACK_SELECT_KEY( I8, F16, 3, 0): + case _PACK_SELECT_KEY( I16, F16, 3, 0): { status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); @@ -483,32 +582,85 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16); status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16); status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( F16, 2, 0): + case _PACK_SELECT_KEY( F16, F16, 2, 0): { status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); 
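/* the two-axis (0,1) variant reduces over the full width x height plane */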
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( F16, 3, 0): + case _PACK_SELECT_KEY( F16, F16, 3, 0): { status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( U8, U8, 1, 0): + case _PACK_SELECT_KEY( U8, U8, 1, 1): + case _PACK_SELECT_KEY( U8, U8, 1, 2): + case _PACK_SELECT_KEY( U8, U8, 2, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP0", &output_ZP0); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale0", &outputScale0); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP1", &output_ZP1); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale1", &outputScale1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8, 3, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; default: VSI_ASSERT( FALSE ); break; } status = vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", 
&uniConvertHalftoFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } #undef _PACK_SELECT_KEY @@ -519,6 +671,16 @@ OnError: vsi_nn_kernel_tensor_attr_release( &attr[0] ); attr[0] = NULL; } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } return status; } diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c new file mode 100644 index 0000000..2cb231c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -0,0 +1,1603 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
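+ * Three shader sources back this op: the plain kernel, a "_big" variant used once
+ * the reshaped height reaches the 65536 limit (VSI_NN_MAX_IMAGE_WIDTH), and the
+ * "_atom" source that holds the reset/pre/post kernels of the split pipeline.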
+ */ +#define KERNEL_SOURCE_1 "scatter_nd_update" +#define KERNEL_SOURCE_2 "scatter_nd_update_big" +#define KERNEL_SOURCE_3 "scatter_nd_update_atom" + +#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _pre_op, _large_type) \ + ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_pre_op << 4) | (_large_type)) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_BIG_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE"_big") + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PRE_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE"_pre") + + #define HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME() \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_reset") + +#define TENSOR_SCATTER_ND_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_BIG_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(IN0_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, I32, I32, 1, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_PRE_NAME(IN0_TYPE), \ + SOURCE }, + + #define TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(I32, I32, I32, 2, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type scatter_nd_update_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(BF16, I32, BF16, BF16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_2) +}; + +static const _kernel_map_type scatter_nd_update_reset_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(KERNEL_SOURCE_3) +}; + +static const _kernel_map_type scatter_nd_update_pre_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(U8, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(I8, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(I16, KERNEL_SOURCE_3) +}; + +static const _kernel_map_type scatter_nd_update_post_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_3) +}; + +/* + * Kernel 
params + */ +static vx_param_description_t _scatter_nd_update_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_reset_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_pre_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + //{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_post_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +#define _SCATTER_ND_UPDATE_PARAM_NUM _cnt_of_array( _scatter_nd_update_kernel_param_def ) +#define _SCATTER_ND_UPDATE_PRE_PARAM_NUM _cnt_of_array( _scatter_nd_update_pre_kernel_param_def ) +#define _SCATTER_ND_UPDATE_POST_PARAM_NUM _cnt_of_array( _scatter_nd_update_post_kernel_param_def ) +#define _SCATTER_ND_UPDATE_RESET_PARAM_NUM _cnt_of_array( _scatter_nd_update_reset_kernel_param_def ) + +static vsi_status get_scatter_nd_update_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + int32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t coordDim, + uint32_t* width, + uint32_t* area, + uint32_t* vol, + int32_t* newDim, + int32_t* isBig + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t dims_num = inputs[0]->attr.dim_num; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + uint32_t elementCnt = 1; + + if (coordDim != 0 && (width == NULL || area == NULL)) + { + return status; + } + +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + + newDim[0] = 0; + for(i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + newDim[0] = 2; + + if ((elementCnt / block_size) 
>= VSI_NN_MAX_IMAGE_WIDTH) + { + isBig[0] |= 1; + } + + if (coordDim == 1) // index shape + { + *width = 0; + *area = 0; + } + else if (coordDim == 2) + { + *width = input_size[dims_num - 2]; + *area = 0; + } + else if (coordDim == 3) + { + *width = input_size[dims_num - 3]; + *area = input_size[dims_num - 3] * input_size[dims_num - 2]; + } + else if (coordDim == 4) + { + *width = input_size[dims_num - 4]; + *area = input_size[dims_num - 4] * input_size[dims_num - 3]; + *vol = input_size[dims_num - 4] * input_size[dims_num - 3] * input_size[dims_num - 2]; + } + else if (coordDim == 5) + { + *width = input_size[dims_num - 5]; + *area = input_size[dims_num - 5] * input_size[dims_num - 4]; + *vol = input_size[dims_num - 5] * input_size[dims_num - 4] * input_size[dims_num - 3]; + } +#undef VSI_NN_MAX_IMAGE_WIDTH + + return VSI_SUCCESS; +} /* _get_EltOP_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL }; + int32_t block_size = 1; + int32_t height = 1; + int32_t index_num = 1; + int32_t width = 0, area = 0, vol = 0; + int32_t coord_dim = 0; + int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; + int32_t src0ZP = 0; + float src0Scale = 1; + int32_t src2ZP = 0; + float src2Scale = 1; + int32_t dstZP = 0; + float dstScale = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &area); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &vol); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = attr[3]->shape->data[0]; + height = attr[3]->shape->data[1]; + index_num = attr[1]->shape->data[1]; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + src0ZP = 0; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src0Scale = 1; + src0ZP = 0; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + src2ZP = attr[2]->asymm.zero_point; + src2Scale = attr[2]->asymm.scale; + } + else if ( attr[2]->quant == 
VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + src2Scale = (1.0f / ((float) ((int64_t)1 << attr[2]->dfp.fl))); + } + else + { + src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl)); + } + src2ZP = 0; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src2Scale = 1; + src2ZP = 0; + } + + if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + dstZP = attr[3]->asymm.zero_point; + dstScale = attr[3]->asymm.scale; + } + else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[3]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[3]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); + } + dstScale = 1.0f/dstScale; + dstZP = 0; + } + else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + dstZP = 0; + } + + if (coord_dim == 5) + { + offset_idx = 1; + } + if (coord_dim == 4 || coord_dim == 5) + { + offsetX = vol; + offsetY = area; + offsetZ = width; + offsetW = 1; + } + else if (coord_dim == 3) + { + offsetX = area; + offsetY = width; + offsetZ = 1; + offsetW = 0; + } + else if (coord_dim == 2) + { + offsetX = width; + offsetY = 1; + offsetZ = 0; + offsetW = 0; + } + else if (coord_dim == 1) + { + offsetX = 1; + offsetY = 0; + offsetZ = 0; + offsetW = 0; + } + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + uint16_t M0 = 0; + uint16_t M1 = 0; + int32_t postShift0 = 0; + int32_t postShift1 = 0; + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; + gpu_dp_inst_t uniAccumulateSum_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_1_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, 
ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + gpu_quantize_multiplier_16bit( (double)src2Scale / dstScale, &M1, &postShift1); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + multAndoutZP1[0] = (uint32_t)(M1); + multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src2ZP * M1); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_1_Lo_2x8, postShift1 ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_1_Lo_2x8", &uniU8MulAndPostShift_1_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); + status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + if (attr[3]) + { + vsi_nn_kernel_tensor_attr_release( &attr[3] ); + attr[3] = NULL; + } + return status; +} /* _scatter_nd_update_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL }; + int32_t block_size = 1; + int32_t height = 1; + int32_t index_num = 1; + int32_t width = 0, area = 0, vol = 0; + int32_t coord_dim = 0; + int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; + int32_t src0ZP = 0; + float src0Scale = 1; + int32_t src2ZP = 0; + float src2Scale = 1; + int32_t dstZP = 0; + float dstScale = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &area); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &vol); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = attr[3]->shape->data[0]; + height = attr[3]->shape->data[1]; + index_num = attr[1]->shape->data[1]; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + src0ZP = 0; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src0Scale = 1; + src0ZP = 0; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + src2ZP = attr[2]->asymm.zero_point; + src2Scale = attr[2]->asymm.scale; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + src2Scale = (1.0f / ((float) ((int64_t)1 << attr[2]->dfp.fl))); + } + else + { + src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl)); + } + src2ZP = 0; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src2Scale = 1; + src2ZP = 0; + } + + if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + dstZP = attr[3]->asymm.zero_point; + dstScale = attr[3]->asymm.scale; + } + else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[3]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[3]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); + } + dstScale = 1.0f/dstScale; + dstZP = 0; + } + else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + dstZP = 0; + } + + if (coord_dim == 5) + { + offset_idx = 1; + } + if (coord_dim == 4 || coord_dim == 5) + { + offsetX = vol; + offsetY = area; + offsetZ = width; + offsetW = 1; + } + else if (coord_dim == 3) + { + offsetX = area; + offsetY = width; + offsetZ = 1; + } + else if (coord_dim == 2) + { + offsetX = width; + offsetY = 1; + offsetZ = 0; + } + else if (coord_dim == 1) + { + offsetX = 1; + offsetY = 0; + offsetZ = 0; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = block_size; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + uint16_t M0 = 0; + uint16_t M1 = 0; + int32_t postShift0 = 0; + int32_t postShift1 = 0; + uint32_t multAndoutZP0[2] 
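+        /* The requantization constants below follow the usual fixed-point scheme:
+         * gpu_quantize_multiplier_16bit() approximates srcScale / dstScale as
+         * M / 2^postShift, and multAndoutZP packs M together with the combined
+         * zero-point term, so the shader can evaluate, roughly,
+         *   dst = (src * M + (dstZP << postShift) - srcZP * M) >> postShift
+         * with one multiply-accumulate per element. */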
= {0}; + uint32_t multAndoutZP1[2] = {0}; + gpu_dp_inst_t uniAccumulateSum_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_1_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + gpu_quantize_multiplier_16bit( (double)src2Scale / dstScale, &M1, &postShift1); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + multAndoutZP1[0] = (uint32_t)(M1); + multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src2ZP * M1); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_1_Lo_2x8, postShift1 ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); + if (attr[2]->quant != VSI_NN_KERNEL_QUANT_NONE) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_1_Lo_2x8", &uniU8MulAndPostShift_1_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); + status |= vsi_nn_kernel_gpu_add_param( node, "update_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); + status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + if (attr[3]) + { + vsi_nn_kernel_tensor_attr_release( &attr[3] ); + attr[3] = NULL; + } + return status; +} /* _scatter_nd_update_big_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_pre_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, 
+ {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t block_size = 1; + int32_t update_width = 1; + int32_t index_num = 1; + int32_t width = 0, area = 0, vol = 0; + int32_t coord_dim = 0; + int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; + int32_t src0ZP = 0; + float src0Scale = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = attr[2]->shape->data[0]; + update_width = attr[1]->shape->data[0]; + index_num = attr[0]->shape->data[1]; + + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + src0ZP = attr[1]->asymm.zero_point; + src0Scale = attr[1]->asymm.scale; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[1]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); + } + src0ZP = 0; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src0Scale = 1; + src0ZP = 0; + } + + if (coord_dim == 5) + { + offset_idx = 1; + } + if (coord_dim == 4 || coord_dim == 5) + { + offsetX = vol; + offsetY = area; + offsetZ = width; + offsetW = 1; + } + else if (coord_dim == 3) + { + offsetX = area; + offsetY = width; + offsetZ = 1; + } + else if (coord_dim == 2) + { + offsetX = width; + offsetY = 1; + offsetZ = 0; + } + else if (coord_dim == 1) + { + offsetX = 1; + offsetY = 0; + offsetZ = 0; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = block_size; + gpu_param.global_size[1] = index_num; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); + status |= 
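+        /* offsetX/Y/Z/W are the strides used here to flatten a coordinate row from
+         * the indices tensor into a linear offset in the reshaped output, e.g. for
+         * coord_dim == 3 the linear index is idx0 * area + idx1 * width + idx2.
+         * offset_idx appears to mark the extra leading coordinate that is skipped
+         * when coord_dim == 5. */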
vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); + status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &src0ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &src0Scale ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _scatter_nd_update_pre_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t block_size = 1; + int32_t height = 1; + int32_t width = 0, area = 0, vol = 0; + int32_t coord_dim = 0; + int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; + int32_t src0ZP = 0; + float src0Scale = 1; + float src2Scale = 1; + int32_t dstZP = 0; + float dstScale = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); // ref + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); // update + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[5] ); // output + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &area); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &vol); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = attr[2]->shape->data[0]; + height = attr[2]->shape->data[1]; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + src0ZP = 0; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src0Scale = 1; + src0ZP = 0; + } + + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + src2Scale = attr[1]->asymm.scale; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[1]->dfp.fl > 0) + { + src2Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); + } + else + { + src2Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); + } + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src2Scale = 1; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + dstZP = attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + } + else if ( 
attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + dstScale = 1.0f/dstScale; + dstZP = 0; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + dstZP = 0; + } + + if (coord_dim == 5) + { + offset_idx = 1; + } + if (coord_dim == 4 || coord_dim == 5) + { + offsetX = vol; + offsetY = area; + offsetZ = width; + offsetW = 1; + } + else if (coord_dim == 3) + { + offsetX = area; + offsetY = width; + offsetZ = 1; + } + else if (coord_dim == 2) + { + offsetX = width; + offsetY = 1; + offsetZ = 0; + } + else if (coord_dim == 1) + { + offsetX = 1; + offsetY = 0; + offsetZ = 0; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = block_size; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + uint16_t M0 = 0; + int32_t postShift0 = 0; + uint32_t multAndoutZP0[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + float output_zp = (float)dstZP; + float scaleInOut = src2Scale / dstScale; + + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); + status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &src2Scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "scaleInOut", &scaleInOut ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] 
= NULL; + } + return status; +} /* _scatter_nd_update_post_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t block_size = 1; + int32_t width = 0; + int32_t height = 0; + int32_t count_width = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + block_size = attr[0]->shape->data[0]; + height = attr[0]->shape->data[1]; + width = block_size * height; + count_width = (height + 3) / 4; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = (width + 3) / 4; + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + status = vsi_nn_kernel_gpu_add_param( node, "count_width", &count_width ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_reset_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t coord_dim, + int32_t isBig + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, isBig ); + + for( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ ) + { + if ( scatter_nd_update_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(scatter_nd_update_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_map[i].function_name ); + kernel->info.parameters = _scatter_nd_update_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _scatter_nd_update_kernel_param_def ); + if (isBig) + { + kernel->info.initialize = _scatter_nd_update_big_initializer; + } + else + { + kernel->info.initialize = _scatter_nd_update_initializer; + } + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_status _query_kernel_large + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_reset, + vsi_nn_kernel_t* kernel_pre, + vsi_nn_kernel_t* kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( 
inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, I32, I32, 1, 1 ); + + for( i = 0; i < _cnt_of_array(scatter_nd_update_pre_map); i ++ ) + { + if ( scatter_nd_update_pre_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_pre_map) ) + { + snprintf( kernel_pre->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_pre_map[i].function_name ); + kernel_pre->info.parameters = _scatter_nd_update_pre_kernel_param_def; + kernel_pre->info.numParams = _SCATTER_ND_UPDATE_PRE_PARAM_NUM; + kernel_pre->info.initialize = _scatter_nd_update_pre_initializer; + + vsi_nn_kernel_add_source( kernel_pre, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_pre_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_pre, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_pre_map[i].source_name ); + } + else + { + status = VSI_FAILURE; + } + + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, 1 ); + + for( i = 0; i < _cnt_of_array(scatter_nd_update_post_map); i ++ ) + { + if ( scatter_nd_update_post_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(scatter_nd_update_post_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_post_map[i].function_name ); + kernel->info.parameters = _scatter_nd_update_post_kernel_param_def; + kernel->info.numParams = _SCATTER_ND_UPDATE_POST_PARAM_NUM; + kernel->info.initialize = _scatter_nd_update_post_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_post_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_post_map[i].source_name ); + } + else + { + status |= VSI_FAILURE; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( I32, I32, I32, 2, 1 ); + + for( i = 0; i < _cnt_of_array(scatter_nd_update_reset_map); i ++ ) + { + if ( scatter_nd_update_reset_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(scatter_nd_update_reset_map) ) + { + snprintf( kernel_reset->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_reset_map[i].function_name ); + kernel_reset->info.parameters = _scatter_nd_update_reset_kernel_param_def; + kernel_reset->info.numParams = _SCATTER_ND_UPDATE_RESET_PARAM_NUM; + kernel_reset->info.initialize = _scatter_nd_update_reset_initializer; + + vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_reset_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reset_map[i].source_name ); + } + else + { + status |= VSI_FAILURE; + } + return status; +} /* _query_kernel_large() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_SCATTER_ND_UPDATE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t 
rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + uint32_t width = 0, area = 0, vol = 0; + int32_t big_flg = 0; + vsi_nn_kernel_dtype_e update_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type); + int32_t i = 0; + + status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0, + NULL, NULL, NULL, &rs_idx_dim, &big_flg); + status |= get_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0, + NULL, NULL, NULL, &rs_in_dim, &big_flg); + status |= get_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, + &width, &area, &vol, &rs_out_dim, &big_flg); + if (status != VSI_SUCCESS) + { + return NULL; + } + + if ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16)) + { + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_node_t tmp_node = NULL; + vsi_nn_kernel_node_t reset_node = NULL; + vsi_nn_kernel_node_param_t pre_params[_SCATTER_ND_UPDATE_PRE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_POST_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t reset_params[_SCATTER_ND_UPDATE_RESET_PARAM_NUM] = { NULL }; + vsi_nn_kernel_t * ikernels[2] = { NULL }; + vsi_nn_tensor_t * tensors[3] = { NULL }; + + ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[0]->unique_id = kernel->unique_id; + ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[1]->unique_id = kernel->unique_id; + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.is_const = FALSE; + attr.vtl = TRUE; + + for(i = 0; i < rs_out_dim; i++) + { + attr.size[i] = shapes[2][i]; + } + attr.dim_num = rs_out_dim; + + tensors[0] = vsi_nn_CreateTensor( graph, &attr ); + attr.size[0] = 1; + tensors[1] = vsi_nn_CreateTensor( graph, &attr ); + attr.size[1] = 1; + tensors[2] = vsi_nn_CreateTensor( graph, &attr ); + + status = _query_kernel_large( inputs, outputs, ikernels[0], ikernels[1], kernel); + if ( VSI_SUCCESS == status) + { + // reset count + reset_node = vsi_nn_kernel_create_node( graph, ikernels[0] ); + if (reset_node) + { + uint32_t index = 0; + /* Pass parameters to node. */ + reset_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + reset_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + reset_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + status = vsi_nn_kernel_node_pass_param( reset_node, reset_params, _SCATTER_ND_UPDATE_RESET_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &reset_params[0] ); + } + + // pre-process + tmp_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); + if (tmp_node) + { + uint32_t index = 0; + /* Pass parameters to node. 
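+                   The reset kernel runs first and resets the int32 temporaries
+                   (tensors[0] and tensors[1]) so that the pre/post kernels created
+                   below start from a clean accumulation/count state.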
*/ + pre_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + pre_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; + pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); + pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); + pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( tmp_node, pre_params, _SCATTER_ND_UPDATE_PRE_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &pre_params[0] ); + vsi_nn_kernel_tensor_release( &pre_params[1] ); + vsi_nn_kernel_scalar_release( &pre_params[5] ); + vsi_nn_kernel_scalar_release( &pre_params[6] ); + vsi_nn_kernel_scalar_release( &pre_params[7] ); + vsi_nn_kernel_scalar_release( &pre_params[8] ); + } + + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass parameters to node. */ + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ND_UPDATE_POST_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &node_params[0] ); + vsi_nn_kernel_tensor_release( &node_params[4] ); + vsi_nn_kernel_tensor_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + } + } + + if ( ikernels[0] ) + { + vsi_nn_kernel_release( &ikernels[0] ); + } + if ( ikernels[1] ) + { + vsi_nn_kernel_release( &ikernels[1] ); + } + if ( tensors[0] ) + { + vsi_nn_ReleaseTensor( &tensors[0] ); + } + if ( tensors[1] ) + { + vsi_nn_ReleaseTensor( &tensors[1] ); + } + if ( tensors[2] ) + { + vsi_nn_ReleaseTensor( &tensors[2] ); + } + if (reset_node) {vsi_nn_kernel_node_release( &reset_node );} + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + } + else + { + status = _query_kernel( inputs, outputs, kernel, coord_dim, big_flg); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass parameters to node. 
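+                   (ref tensor, indices, updates and the output, followed by the
+                   width/area/vol/coord_dim scalars that the kernel initializer
+                   reads back).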
*/ + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + //tmp_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _SCATTER_ND_UPDATE_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &tmp_params[0] ); + vsi_nn_kernel_tensor_release( &tmp_params[1] ); + vsi_nn_kernel_tensor_release( &tmp_params[2] ); + vsi_nn_kernel_tensor_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( scatter_nd_update, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c new file mode 100644 index 0000000..0fdf6a8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c @@ -0,0 +1,292 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + +#define SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + ( ( IN_DTYPE << 8 ) | ( OUT_DTYPE ) ) +#define SIGNAL_FRAME_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.signal_frame_"#IN_DTYPE"to"#OUT_DTYPE), \ + "signal_frame" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _signal_frame_kernel_map[] = +{ + // Register kernel here + SIGNAL_FRAME_KERNEL_MAP( I16, I16 ), + SIGNAL_FRAME_KERNEL_MAP( F16, F16 ), + SIGNAL_FRAME_KERNEL_MAP( BF16, BF16 ), + SIGNAL_FRAME_KERNEL_MAP( U8, U8 ), + SIGNAL_FRAME_KERNEL_MAP( I8, I8 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _signal_frame_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def ) +#define FRAME_STEP (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_signal_frame_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr = NULL; + vsi_int_array_t * out_shape = NULL; + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + out_shape = attr->shape; + + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + if ( attr->dtype == F16 || attr->dtype == I16 || attr->dtype == U16 || attr->dtype == BF16) + { + gpu_param.global_scale[0] = 8; + } + gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = out_shape->data[1]; + gpu_param.global_size[2] = out_shape->data[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (attr) + { + vsi_nn_kernel_tensor_attr_release( &attr ); + attr = NULL; + } + + return status; +} /* _signal_frame_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = SIGNAL_FRAME_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < _cnt_of_array(_signal_frame_kernel_map); i ++ ) + { + if ( _signal_frame_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_signal_frame_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", 
_signal_frame_kernel_map[i].function_name ); + kernel->info.parameters = _signal_frame_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def ); + kernel->info.initialize = _signal_frame_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _signal_frame_kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _signal_frame_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" ); + int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + float pad_value = vsi_nn_kernel_param_get_float32( params, "pad_val" ); + int32_t num_frames = outputs[0]->attr.size[axis + 1]; + int32_t rank = inputs[0]->attr.dim_num; + int32_t inner = 1; + int32_t outer = 1; + int32_t length_samples = inputs[0]->attr.size[axis]; + int32_t i = 0; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + + for (i = 0; i < axis; i++) + { + inner *= inputs[0]->attr.size[i]; + } + + for (i = axis + 1; i < rank; i++) + { + outer *= inputs[0]->attr.size[i]; + } + + shape[0][0] = inner; + shape[0][1] = length_samples; + shape[0][2] = 1; + shape[0][3] = outer; + + shape[1][0] = inner; + shape[1][1] = frame_length; + shape[1][2] = num_frames; + shape[1][3] = outer; + + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape[0], 4 ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape[1], 4 ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + rs_tensors[1]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + if ( pad_end ) + { + // Set default border mode. + vx_border_t border; + uint32_t data = 0; + uint32_t dsize = 1; + + vsi_nn_Float32ToDtype(pad_value, (uint8_t*)&data, &outputs[0]->attr.dtype); + border.mode = VX_BORDER_CONSTANT; + dsize = vsi_nn_GetTypeBytes( inputs[0]->attr.dtype.vx_type ); + if ( dsize == 1 ) + { + border.constant_value.U8 = (uint8_t)data; + } + else if ( dsize == 4 ) + { + border.constant_value.U32 = data; + } + else + { + border.constant_value.U16 = (uint16_t)data; + } + + status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + } + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM, + &rs_tensors[0], input_num, &rs_tensors[1], output_num ); + node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create( + graph, I32, &frame_step ); + /* Pass parameters to node. 
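+            (the reshaped input/output tensors packed above plus the frame_step
+            scalar at node_params[FRAME_STEP]).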
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } +final: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + if (node_params[FRAME_STEP]) + { + vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( signal_frame, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c index 35b8b99..69bfb4b 100644 --- a/src/tim/vx/internal/src/kernel/evis/slice_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -40,7 +39,6 @@ __BEGIN_DECLS - #define _SLICE_KERNEL_SOURCE "slice" #define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice") @@ -379,7 +377,6 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -421,7 +418,7 @@ static vsi_nn_kernel_node_t _setup if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) || input_batch != output_batch ) { - return NULL; + goto final; } image_2d = (rank[0] < 3 || shapes[0][2] == 1); @@ -443,6 +440,12 @@ static vsi_nn_kernel_node_t _setup } } +final: + for (i = 0; i < _IO_NUM; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c new file mode 100644 index 0000000..01b6155 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c @@ -0,0 +1,641 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum +{ + INTERNAL_KERNEL_GET_MATRIX, + INTERNAL_KERNEL_WARP_AFFINE, +} _internal_kernel_e; + +#define _GET_MATRIX_SOURCE "get_matrix" +#define _WARP_AFFINE_SOURCE "warp_affine" + +// Add kernel hashtable here +#define GET_MATRIX_HASH_KEY( IN1_DTYPE, OUT_DTYPE ) \ + (( IN1_DTYPE << 8 ) | ( OUT_DTYPE )) +#define GET_MATRIX_KERNEL_MAP( IN1_DTYPE, OUT_DTYPE ) \ + { GET_MATRIX_HASH_KEY( IN1_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.get_matrix_"#IN1_DTYPE"toF32"), \ + _GET_MATRIX_SOURCE } + +#define WARP_AFFINE_HASH_KEY( IN0_DTYPE, OUT_DTYPE ) \ + (( IN0_DTYPE << 8 ) | ( OUT_DTYPE )) +#define WARP_AFFINE_KERNEL_MAP( IN0_DTYPE, OUT_DTYPE ) \ + { WARP_AFFINE_HASH_KEY( IN0_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.warp_affine_"#IN0_DTYPE"to"#OUT_DTYPE), \ + _WARP_AFFINE_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _get_matrix_kernel_map[] = +{ + // Register kernel here + GET_MATRIX_KERNEL_MAP( F16, F32 ), + GET_MATRIX_KERNEL_MAP( I16, F32 ), + GET_MATRIX_KERNEL_MAP( U8, F32 ), + GET_MATRIX_KERNEL_MAP( I8, F32 ), +}; + +static const _kernel_map_type _warp_affine_kernel_map[] = +{ + // Register kernel here + WARP_AFFINE_KERNEL_MAP( F16, F16 ), + WARP_AFFINE_KERNEL_MAP( U8, U8 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _get_matrix_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GET_MATRIX_PARAM_NUM _cnt_of_array( _get_matrix_kernel_param_def ) +#define HAS_THETA_1_1 (2) +#define HAS_THETA_1_2 (3) +#define HAS_THETA_1_3 (4) +#define HAS_THETA_2_1 (5) +#define HAS_THETA_2_2 (6) +#define HAS_THETA_2_3 (7) +#define THETA_1_1 (8) +#define THETA_1_2 (9) +#define THETA_1_3 (10) +#define THETA_2_1 (11) +#define THETA_2_2 (12) +#define THETA_2_3 (13) +#define I_WIDTH (14) +#define I_HEIGHT (15) +#define O_WIDTH (16) +#define O_HEIGHT (17) + +static vx_param_description_t _warp_affine_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + 
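+    /* Input 0 is the image tensor, input 1 is the per-batch transform written by
+     * the get_matrix kernel (a 16-element row per batch, see _setup below), and
+     * the output is the warped image. */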
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _WARP_AFFINE_PARAM_NUM _cnt_of_array( _warp_affine_kernel_param_def ) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_get_matrix_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr = NULL; + vsi_int_array_t * in_shape = NULL; + float theta[8] = {0}; + float input_scale = 1.0f; + float input_tail = 0; + float input_w = 1.0f; + float input_h = 1.0f; + float output_w = 1.0f; + float output_h = 1.0f; + float scale[4] = {0}; + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + + if ( attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr->dfp.fl; + if (fl > 0) + { + input_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input_scale = (float)((int64_t)1 << -fl); + } + } + else if ( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input_scale = attr->asymm.scale; + input_tail = 0 - attr->asymm.zero_point * input_scale; + } + + in_shape = attr->shape; + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_1], &theta[0]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_2], &theta[1]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_3], &theta[2]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_1], &theta[4]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_2], &theta[5]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_3], &theta[6]); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[I_WIDTH], &input_w); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[I_HEIGHT], &input_h); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[O_WIDTH], &output_w); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[O_HEIGHT], &output_h); + CHECK_STATUS_FAIL_GOTO( status, final ); + + scale[0] = input_w / output_w; + scale[1] = input_h / output_h; + scale[2] = input_w / output_h; + scale[3] = input_h / output_w; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = 1; + gpu_param.global_size[1] = in_shape->data[1]; + + status = vsi_nn_kernel_gpu_add_param( node, + "theta_1", &theta[0] ); + status |= vsi_nn_kernel_gpu_add_param( node, + "theta_2", &theta[4] ); + status |= vsi_nn_kernel_gpu_add_param( node, + "scale", &scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_scale", &input_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_tail", &input_tail ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (attr) + { + vsi_nn_kernel_tensor_attr_release( &attr ); + attr = NULL; + } + + return status; +} /* _get_matrix_initializer() */ + +DEF_KERNEL_INITIALIZER(_warp_affine_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 
0} + }; + vsi_nn_kernel_tensor_attr_t * attr[2] = {NULL}; + vsi_int_array_t * out_shape = NULL; + float input_scale = 1.0f; + float input_tail = 0; + float output_scale = 1.0f; + float output_zp = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[0]->dfp.fl; + if (fl > 0) + { + input_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input_scale = (float)((int64_t)1 << -fl); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input_scale = attr[0]->asymm.scale; + input_tail = 0 - attr[0]->asymm.zero_point * input_scale; + } + + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + int32_t fl = attr[1]->dfp.fl; + + if (fl >= 0) + { + output_scale = (vx_float32) ((vx_int64)1 << fl); + } + else if (fl < 0) + { + output_scale = 1.0f / (vx_float32) ((vx_int64)1 << -fl); + } + } + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_scale = 1.0f / attr[1]->asymm.scale;; + output_zp = (float)attr[1]->asymm.zero_point; + } + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 2; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = out_shape->data[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + do + { + gpu_dp_inst_t uniConvertDatatoF32_0_4x4 = {{ + 0x01010101, // TCfg + 0x01010000, // ASelt + 0x00010000, 0x00010000, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertDatatoF32_1_4x4 = {{ + 0x01010101, // TCfg + 0x01010000, // ASelt + 0x00030002, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_0_4x4", &uniConvertDatatoF32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_1_4x4", &uniConvertDatatoF32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input_tail", &input_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", 
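+    /* input_scale/input_tail map quantized input values to real values
+     * (real = q * input_scale + input_tail, with input_tail = -zero_point * scale
+     * for asymmetric inputs); output_scale/output_zp fold in the inverse mapping
+     * for the output, presumably so the shader can work in float internally. */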
&output_zp); + if (attr[1]->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractHalf8_2x8 ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + }while(0); + + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _warp_affine_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + const uint32_t hashkey, + _internal_kernel_e kernel_id + ) +{ + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def; + vsi_status status = VSI_FAILURE; + const _kernel_map_type* kernel_map; + size_t kernel_map_size; + size_t param_size; + uint32_t i; + + switch( kernel_id ) + { + case INTERNAL_KERNEL_GET_MATRIX: + initializer = _get_matrix_initializer; + kernel_map = _get_matrix_kernel_map; + kernel_map_size = _cnt_of_array( _get_matrix_kernel_map ); + param_def = _get_matrix_kernel_param_def; + param_size = _GET_MATRIX_PARAM_NUM; + break; + case INTERNAL_KERNEL_WARP_AFFINE: + initializer = _warp_affine_initializer; + kernel_map = _warp_affine_kernel_map; + kernel_map_size = _cnt_of_array( _warp_affine_kernel_map ); + param_def = _warp_affine_kernel_param_def; + param_size = _WARP_AFFINE_PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == hashkey ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_KERNEL_SIZE (2) +#define MATRIX_INDEX (0) +#define WARP_AFFINE_INDEX (1) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GET_MATRIX_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t warp_affine_node_params[_WARP_AFFINE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * warp_affine_tensors[2] = {NULL}; + uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; + int32_t has_theta_1_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_1" ); + int32_t has_theta_1_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_2" ); + int32_t has_theta_1_3 = vsi_nn_kernel_param_get_int32( params, 
"has_theta_1_3" ); + int32_t has_theta_2_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_1" ); + int32_t has_theta_2_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_2" ); + int32_t has_theta_2_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_3" ); + float theta_1_1 = vsi_nn_kernel_param_get_float32( params, "theta_1_1" ); + float theta_1_2 = vsi_nn_kernel_param_get_float32( params, "theta_1_2" ); + float theta_1_3 = vsi_nn_kernel_param_get_float32( params, "theta_1_3" ); + float theta_2_1 = vsi_nn_kernel_param_get_float32( params, "theta_2_1" ); + float theta_2_2 = vsi_nn_kernel_param_get_float32( params, "theta_2_2" ); + float theta_2_3 = vsi_nn_kernel_param_get_float32( params, "theta_2_3" ); + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + float input_w = (float)inputs[0]->attr.size[0]; + float input_h = (float)inputs[0]->attr.size[1]; + float output_w = (float)outputs[0]->attr.size[0]; + float output_h = (float)outputs[0]->attr.size[1]; + int32_t i = 0; + + if (align_corners && output_w > 1) + { + output_w = output_w - 1; + } + + if (align_corners && output_h > 1) + { + output_h = output_h - 1; + } + + // Check if gpu can support the size + if( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + ikernels[i]->unique_id = kernel->unique_id; + } + + memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.size[0] = 16; + attr.dim_num = 2; + attr.dtype.vx_type = VSI_NN_TYPE_UINT16; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.is_const = FALSE; + attr.vtl = TRUE; + tensors[0] = vsi_nn_CreateTensor( graph, &attr ); + + attr.size[3] = attr.size[1]; + attr.size[2] = attr.size[1] = 1; + attr.dim_num = inputs[0]->attr.dim_num; + tensors[1] = vsi_nn_reshape_tensor( graph, + tensors[0], (uint32_t*)attr.size, attr.dim_num ); + + warp_affine_tensors[0] = inputs[0]; + warp_affine_tensors[1] = tensors[1]; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + hashkeys[MATRIX_INDEX]= GET_MATRIX_HASH_KEY( in1_dtype, F32 ); + hashkeys[WARP_AFFINE_INDEX] = WARP_AFFINE_HASH_KEY( in0_dtype, out_dtype ); + + status = _query_kernel( ikernels[MATRIX_INDEX], hashkeys[MATRIX_INDEX], INTERNAL_KERNEL_GET_MATRIX ); + if( VSI_SUCCESS != status ) + { + goto final; + } + + status = _query_kernel( ikernels[WARP_AFFINE_INDEX], hashkeys[WARP_AFFINE_INDEX], INTERNAL_KERNEL_WARP_AFFINE ); + if( VSI_SUCCESS != status ) + { + goto final; + } + + // Get Matrix + node = vsi_nn_kernel_create_node( graph, ikernels[MATRIX_INDEX] ); + vsi_nn_kernel_node_pack_io( node_params, _GET_MATRIX_PARAM_NUM, + &inputs[1], 1, &tensors[0], 1 ); + node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 ); + node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 ); + node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 ); + node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 ); + node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 ); + node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 ); + 
node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 ); + node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 ); + node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 ); + node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 ); + node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 ); + node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 ); + node_params[I_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &input_w ); + node_params[I_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &input_h ); + node_params[O_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &output_w ); + node_params[O_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &output_h ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GET_MATRIX_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] ); + vsi_nn_kernel_scalar_release( &node_params[I_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[I_HEIGHT] ); + vsi_nn_kernel_scalar_release( &node_params[O_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[O_HEIGHT] ); + vsi_nn_kernel_node_release( &node ); + + // Warp Affine + node = vsi_nn_kernel_create_node( graph, ikernels[WARP_AFFINE_INDEX] ); + if (node) + { + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = 0; + border.constant_value.S16 = 0; + border.constant_value.U8 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && + inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vsi_nn_kernel_node_set_border( node, &border ); + VSI_ASSERT( status == VSI_SUCCESS ); + } + vsi_nn_kernel_node_pack_io( warp_affine_node_params, _WARP_AFFINE_PARAM_NUM, + warp_affine_tensors, 2, outputs, 1 ); + status = vsi_nn_kernel_node_pass_param( node, warp_affine_node_params, _WARP_AFFINE_PARAM_NUM ); +final: + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + if( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); + } + if( tensors[i] ) + { + vsi_nn_ReleaseTensor( &tensors[i] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( spatial_transformer, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c new file mode 100644 index 0000000..cb97d0b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c @@ -0,0 +1,248 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), 
+* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define KERNEL_SOURCE "tensorstackconcat", + +#define HASH_SH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d)) + +#define PACK_KERNEL_8BITS_MAP(SRC_TYPE, OUT_TYPE) \ + { HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 0), \ + CVIVANTE_NAMESPACE("evis.tensorstackconcat_8bits"), \ + KERNEL_SOURCE }, + +#define PACK_KERNEL_8BITS_MAP_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 1), \ + CVIVANTE_NAMESPACE("evis.tensorstackconcat_8bits_2D"), \ + KERNEL_SOURCE }, + +#define PACK_KERNEL_16BITS_MAP(SRC_TYPE, OUT_TYPE) \ + { HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 0), \ + CVIVANTE_NAMESPACE("evis.tensorstackconcat_16bits"), \ + KERNEL_SOURCE }, + +#define PACK_KERNEL_16BITS_MAP_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 1), \ + CVIVANTE_NAMESPACE("evis.tensorstackconcat_16bits_2D"), \ + KERNEL_SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _tensorstackconcat_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_8BITS_MAP( I8, I8 ) + PACK_KERNEL_8BITS_MAP( U8, U8 ) + PACK_KERNEL_8BITS_MAP_2D( I8, I8 ) + PACK_KERNEL_8BITS_MAP_2D( U8, U8 ) + + PACK_KERNEL_16BITS_MAP( F16, F16 ) + PACK_KERNEL_16BITS_MAP( BF16, BF16 ) + PACK_KERNEL_16BITS_MAP( I16, I16 ) + PACK_KERNEL_16BITS_MAP_2D( F16, F16 ) + PACK_KERNEL_16BITS_MAP_2D( BF16, BF16 ) + PACK_KERNEL_16BITS_MAP_2D( I16, I16 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _tensorstackconcat_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _TENSORSTACKCONCAT_PARAM_NUM _cnt_of_array( _tensorstackconcat_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_tensorstackconcat_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + 
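For reference, the EVIS kernel variant here is selected by the packed key built with HASH_SH_KEY above and matched against _tensorstackconcat_kernel_map in _query_kernel below. A minimal sketch of that packing, assuming the dtype arguments are the vsi_nn_kernel_dtype_e codes used elsewhere in this file; make_key is an illustrative name, not part of the library:

#include <stdint.h>

/* Same layout as HASH_SH_KEY: input dtype from bit 20, output dtype from bit 8,
   and the low bits carry the 2D-specialization flag used to pick the _2D kernels. */
static uint32_t make_key(uint32_t in_dtype, uint32_t out_dtype, uint32_t is_2d)
{
    return (in_dtype << 20) | (out_dtype << 8) | is_2d;
}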
vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * in_shape = NULL; + // Add initializer + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + in_shape = input_attr->shape; + + if (input_attr->dtype == I16 || input_attr->dtype == F16) + { + gpu_param.global_scale[0] = 8; + } + else + { + gpu_param.global_scale[0] = 16; + } + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2( + (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = in_shape->size > 2 ? in_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _tensorstackconcat_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _tensorstackconcat_kernel_map; + size_t kernel_map_size = _cnt_of_array( _tensorstackconcat_kernel_map ); + vx_param_description_t * param_def = _tensorstackconcat_kernel_param_def; + vx_kernel_initialize_f initializer = _tensorstackconcat_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SH_KEY( in_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _tensorstackconcat_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_TENSORSTACKCONCAT_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TENSORSTACKCONCAT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _TENSORSTACKCONCAT_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( tensorstackconcat, _setup ) diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index e0c43e2..5eefb9c 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -444,14 +444,15 @@ static vsi_status _gpu_register if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=2" ); + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", + context->config.use_40bits_va ); } } else { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=%d", - context->config.evis.ver ); + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", + context->config.evis.ver, context->config.use_40bits_va ); } // Pack build option if( kernel->gpu.sources[active_fmt].build_option.data ) @@ -812,7 +813,6 @@ void vsi_nn_kernel_add_build_option } snprintf( &buf[org_size], item_size + 2, " %s", option ); build_option->data = buf; - } /* vsi_nn_kernel_add_build_option() */ void vsi_nn_kernel_release @@ -1224,18 +1224,7 @@ vsi_status vsi_nn_kernel_pirority_set static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) { - char *envctrl; - static int32_t enableShader = -1; - - if (enableShader == -1) - { - enableShader = 1; - envctrl = getenv("VIV_VX_ENABLE_SHADER"); - if (envctrl) - { - enableShader = atoi(envctrl); - } - } + int32_t enableShader = graph->ctx->options.enable_shader; #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT if ( graph->ctx->config.subGroupSize == 0 ) @@ -1251,4 +1240,3 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) return FALSE; } - diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index fd4d2e7..2b98e93 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -127,5 +127,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_sigmoid) REGISTER_VX_FIRST_KERNEL_SELECTOR(clip) REGISTER_VX_FIRST_KERNEL_SELECTOR(relu_keras) REGISTER_VX_FIRST_KERNEL_SELECTOR(erf) +REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu) +REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 0a64be9..977be07 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -78,6 +78,59 @@ static float mish_eval(float data, float alpha) return data; } +static float erf_eval(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; /*n!*/ + float x_pow = x; + int32_t one = 1; + int32_t n = 1; + + if (x <= -3) + { + return -1; + } + else if (x >= 3) + { + return 1; + } + + while (vsi_abs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n ++; + } +#define VSI_MUL2_RSQRTPI (1.1283791670955126f) + + res *= VSI_MUL2_RSQRTPI; + + return res; +} + +static float gelu_eval(float data, float alpha) +{ + data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f)))); + + return data; +} + + +#define VSI_SQRT_2_RCP_PI 0.7978845834732056f +static float hgelu_eval(float data, float alpha) +{ + float cdf = 
(float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI * + (data + 0.044715f * data * data * data))))); + + return data * cdf; +} + #ifdef VX_USER_LOOKUP_TABLE_SUPPORT static int32_t _lut_comparator(const void *pa, const void *pb) { @@ -232,6 +285,8 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( log, log_eval ) REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( elu, elu_eval ) REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( neg, neg_eval ) REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_sigmoid, hsigmoid_eval ) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( gelu, gelu_eval ) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_gelu, hgelu_eval ) #undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/erf_vx.c b/src/tim/vx/internal/src/kernel/vx/erf_vx.c index 8daf0be..f33fa23 100644 --- a/src/tim/vx/internal/src/kernel/vx/erf_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/erf_vx.c @@ -38,8 +38,9 @@ typedef struct _sort_lut_s float val; } sort_lut; -static float erf_eval(float x) +static float erf_eval(float _x) { + float x = vsi_clamp(_x, -2, 2); float res = 0; float tmp = x; float factorial = 1; /*n!*/ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl b/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl index 227e659..e2ed333 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl @@ -1,4 +1,10 @@ +#define READ_IMAGEF_ARRAY2D(dest, tensor, coord) \ + do { \ + int depth = get_image_array_size(tensor); \ + _viv_asm(CLAMP0MAX, coord_in0.z, coord_in0.z, in0_depth - 1); \ + dest = read_imagef(tensor, coord); \ + } while(0) __kernel void batch_norm_F32toF32 ( __read_only image2d_array_t input, @@ -17,11 +23,11 @@ __kernel void batch_norm_F32toF32 int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); float4 src, mean, var, gamma, beta; - readImage2DArray(src, input, coord); - readImage2DArray(mean, Mean, coord); - readImage2DArray(var, Variance, coord); - readImage2DArray(gamma, Gamma, coord); - readImage2DArray(beta, Beta, coord); + READ_IMAGEF_2DARRAY(src, input, coord); + READ_IMAGEF_2DARRAY(mean, Mean, coord); + READ_IMAGEF_2DARRAY(var, Variance, coord); + READ_IMAGEF_2DARRAY(gamma, Gamma, coord); + READ_IMAGEF_2DARRAY(beta, Beta, coord); float4 dst; src.x = src.x - mean.x; @@ -81,11 +87,11 @@ __kernel void batch_norm_U8toU8 uint4 data; float4 src, mean, var, gamma, beta; - readImage2DArray(data, input, coord); - readImage2DArray(mean, Mean, coord); - readImage2DArray(var, Variance, coord); - readImage2DArray(gamma, Gamma, coord); - readImage2DArray(beta, Beta, coord); + READ_IMAGEF_2DARRAY(data, input, coord); + READ_IMAGEF_2DARRAY(mean, Mean, coord); + READ_IMAGEF_2DARRAY(var, Variance, coord); + READ_IMAGEF_2DARRAY(gamma, Gamma, coord); + READ_IMAGEF_2DARRAY(beta, Beta, coord); src = convert_float4(data) * input_scale - input_tail; src.x = src.x - mean.x; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl index 9a322a5..37f1db8 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl @@ -18,11 +18,19 @@ inline Image create_image_from_image2d(image2d_t input, int stride_x) int8 desc; _viv_asm(COPY, desc, input, sizeof(desc)); +#if (USE_40BITS_VA==0) + uint address = as_uint(desc.s0); + int stride_y = desc.s1; +#else + ulong address = as_ulong(desc.s05); + int stride_y = desc.s6; +#endif + Image img = { - 
.ptr = (uchar*)desc.s0, + .ptr = (uchar*)address, .stride_x = stride_x, - .stride_y = desc.s1 + .stride_y = stride_y }; return img; @@ -36,53 +44,60 @@ typedef struct Tensor int stride_z; } Tensor; -inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord) +inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord) { return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z; } inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x) { +#if (USE_40BITS_VA==0) int8 desc; _viv_asm(COPY, desc, input, sizeof(desc)); + uint address = as_uint(desc.s0); + int stride_y = desc.s1; + int stride_z = desc.s4; +#else + int16 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + ulong address = as_ulong(desc.s05); + int stride_y = desc.s6; + int stride_z = desc.sa; +#endif + Tensor t = { - .ptr = (uchar*)desc.s0, + .ptr = (uchar*)address, .stride_x = stride_x, - .stride_y = desc.s1, - .stride_z = desc.s4 + .stride_y = stride_y, + .stride_z = stride_z }; return t; } -#define readImage2DArray(Dest, Image, Coord) \ - do { \ - int8 desc; \ - _viv_asm(COPY, desc, Image, sizeof(desc)); \ - _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ - int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \ - _viv_asm(MOV, (Coord).w, baseAddr); \ - _viv_asm(IMAGE_READ_3D, Dest, Image, (Coord).xyww); \ - } while (0) +#define READ_IMAGEF_2DARRAY(dest, tensor, coord) \ + do { \ + int depth = get_image_array_size(tensor); \ + int4 coord_in = coord; \ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \ + dest = read_imagef(tensor, coord_in); \ + } while(0) -#define writeImage2DArray(Image, Coord, Color) \ - do { \ - int8 desc; \ - _viv_asm(COPY, desc, Image, sizeof(desc)); \ - _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ - int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \ - _viv_asm(MOV, (Coord).w, baseAddr); \ - _viv_asm(IMAGE_WRITE_3D, Color, Image, (Coord).xyww); \ - } while (0) +#define READ_IMAGEI_2DARRAY(dest, tensor, coord) \ + do { \ + int depth = get_image_array_size(tensor); \ + int4 coord_in = coord; \ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \ + dest = read_imagei(tensor, coord_in); \ + } while(0) -#define readImage(Dest, Image, Coord) \ - do { \ - _viv_asm(IMAGE_READ, Dest, Image, Coord); \ - } while (0) - -#define writeImage(Image, Coord, Color) \ - do { \ - _viv_asm(IMAGE_WRITE, Color, Image, Coord); \ - } while (0) +#define READ_IMAGEUI_2DARRAY(dest, tensor, coord) \ + do { \ + int depth = get_image_array_size(tensor); \ + int4 coord_in = coord; \ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \ + dest = read_imageui(tensor, coord_in); \ + } while(0) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl index 5b23144..bb31c02 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl @@ -1,12 +1,12 @@ -float4 eltwise_unary_sin(float4 x, float alpha) +float eltwise_unary_sin(float x, float alpha) { return native_sin(x); } #define logE (1.44269502f) #define twoLogE (logE * 2.0f) -float4 eltwise_unary_exp(float4 x, float alpha) +float eltwise_unary_exp(float x, float alpha) { x *= logE; x = exp2(x); @@ -14,33 +14,33 @@ float4 eltwise_unary_exp(float4 x, float alpha) } #define rlogE (0.693147182f) -float4 eltwise_unary_log(float4 x, float alpha) +float eltwise_unary_log(float x, float alpha) { x = log2(x); return x * rlogE; } -float4 
eltwise_unary_elu(float4 val, float alpha) +float eltwise_unary_elu(float val, float alpha) { - float4 x = val * logE; + float x = val * logE; x = exp2(x) * alpha - alpha; return val < 0 ? x : val; } -float4 eltwise_unary_neg(float4 x, float alpha) +float eltwise_unary_neg(float x, float alpha) { return x * -1; } -float4 eltwise_unary_hard_sigmoid(float4 x, float alpha) +float eltwise_unary_hard_sigmoid(float x, float alpha) { x = 0.2 * x + 0.5; x = clamp(x, 0, 1); return x; } -float4 _softrelu(float4 x, float alpha) +float _softrelu(float x, float alpha) { x *= logE; x = exp2(x); @@ -49,7 +49,7 @@ float4 _softrelu(float4 x, float alpha) return x * rlogE; } -float4 _tanh(float4 x, float alpha) +float _tanh(float x, float alpha) { x *= -twoLogE; x = 1 + exp2(x); @@ -57,16 +57,60 @@ float4 _tanh(float4 x, float alpha) return (2 * x - 1); } -float4 eltwise_unary_mish(float4 x, float alpha) +float eltwise_unary_mish(float x, float alpha) { - float4 y = _softrelu(x, alpha); + float y = _softrelu(x, alpha); x = x * _tanh(y, alpha); return x; } -float4 eltwise_unary_round(float4 x, float alpha) +float eltwise_unary_round(float x, float alpha) { - return convert_float4(convert_int4_rte(x)); + return convert_float(convert_int_rte(x)); +} + +#define MUL2_RSQRTPI (1.1283791670955126f) +float erf_eval(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; + float x_pow = x; + float one = 1.0f; + float n = 1; + + if (x <= -3) + return -1; + else if (x >= 3) + return 1; + + while (fabs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n += 1.0f; + } + return res * MUL2_RSQRTPI; +} +#define RSQRT2 (0.70710678118654752440084436210485f) +float eltwise_unary_gelu(float x, float alpha) +{ + x = 0.5f * x * (1 + erf_eval(x * RSQRT2)); + + return x; +} + +#define SQRT_2_RCP_PI 0.7978845834732056f +float eltwise_unary_hard_gelu(float x, float alpha) +{ + float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI * + (x + 0.044715f * x * x * x), 0); + return x * cdf; } #define ELTWISE_UNARY_F32(func_name) \ @@ -85,9 +129,10 @@ __kernel void func_name##_F32toF32 \ \ float4 src = read_imagef(input, coord); \ \ - float4 dst = eltwise_unary_##func_name(src, alpha); \ + float4 dst = 0; \ + dst.x = eltwise_unary_##func_name(src.x, alpha); \ \ - write_imagef(output, coord, dst); \ + write_imagef(output, coord, dst.xxxx); \ } ELTWISE_UNARY_F32(sin) ELTWISE_UNARY_F32(exp) @@ -97,6 +142,8 @@ ELTWISE_UNARY_F32(neg) ELTWISE_UNARY_F32(mish) ELTWISE_UNARY_F32(hard_sigmoid) ELTWISE_UNARY_F32(round) +ELTWISE_UNARY_F32(gelu) +ELTWISE_UNARY_F32(hard_gelu) #define ELTWISE_UNARY_F32_2D(func_name) \ __kernel void func_name##_F32toF32_2D \ @@ -114,9 +161,10 @@ __kernel void func_name##_F32toF32_2D \ \ float4 src = read_imagef(input, coord); \ \ - float4 dst = eltwise_unary_##func_name(src, alpha); \ + float4 dst = 0; \ + dst.x = eltwise_unary_##func_name(src.x, alpha); \ \ - write_imagef(output, coord, dst); \ + write_imagef(output, coord, dst.xxxx); \ } ELTWISE_UNARY_F32_2D(sin) ELTWISE_UNARY_F32_2D(exp) @@ -126,6 +174,8 @@ ELTWISE_UNARY_F32_2D(neg) ELTWISE_UNARY_F32_2D(mish) ELTWISE_UNARY_F32_2D(hard_sigmoid) ELTWISE_UNARY_F32_2D(round) +ELTWISE_UNARY_F32_2D(gelu) +ELTWISE_UNARY_F32_2D(hard_gelu) #define ELTWISE_UNARY_U8(func_name) \ __kernel void func_name##_U8toU8 \ @@ -144,7 +194,7 @@ __kernel void func_name##_U8toU8 \ uint4 src = read_imageui(input, coord); \ float4 data = convert_float4(src) * inputScale - inputTail; \ \ - data = 
eltwise_unary_##func_name(data, alpha); \ + data.x = eltwise_unary_##func_name(data.x, alpha); \ uint4 dst = convert_uint4(data * outputScale + outputZP); \ \ write_imageui(output, coord, dst); \ @@ -157,6 +207,8 @@ ELTWISE_UNARY_U8(neg) ELTWISE_UNARY_U8(mish) ELTWISE_UNARY_U8(hard_sigmoid) ELTWISE_UNARY_U8(round) +ELTWISE_UNARY_U8(gelu) +ELTWISE_UNARY_U8(hard_gelu) #define ELTWISE_UNARY_U8_2D(func_name) \ __kernel void func_name##_U8toU8_2D \ @@ -175,7 +227,7 @@ __kernel void func_name##_U8toU8_2D \ uint4 src = read_imageui(input, coord); \ float4 data = convert_float4(src) * inputScale - inputTail; \ \ - data = eltwise_unary_##func_name(data, alpha); \ + data.x = eltwise_unary_##func_name(data.x, alpha); \ uint4 dst = convert_uint4(data * outputScale + outputZP); \ \ write_imageui(output, coord, dst); \ @@ -188,6 +240,8 @@ ELTWISE_UNARY_U8_2D(neg) ELTWISE_UNARY_U8_2D(mish) ELTWISE_UNARY_U8_2D(hard_sigmoid) ELTWISE_UNARY_U8_2D(round) +ELTWISE_UNARY_U8_2D(gelu) +ELTWISE_UNARY_U8_2D(hard_gelu) __kernel void neg_I32toI32 ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl b/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl index 9f38f95..0a0e410 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl @@ -1,6 +1,7 @@ #define MUL2_RSQRTPI (1.1283791670955126f) -float eltwise_unary_erf(float x) +float eltwise_unary_erf(float _x) { + float x = clamp(_x, -2, 2); float res = 0; float tmp = x; float factorial = 1; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl index 581694a..746a06e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl @@ -6,8 +6,8 @@ __kernel void floordiv_F32F32toF32( int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); float4 src0; float4 src1; - readImage2DArray(src0, input, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEF_2DARRAY(src0, input, coord); + READ_IMAGEF_2DARRAY(src1, input1, coord); float4 dst = floor(src0 / src1); write_imagef(output, coord, dst); } @@ -32,8 +32,8 @@ __kernel void floordiv_I32I32toI32( int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 src0; int4 src1; - readImage2DArray(src0, input, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1))); write_imagei(output, coord, dst); } @@ -64,8 +64,8 @@ __kernel void floordiv_I32I32toU8( int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 src0; int4 src1; - readImage2DArray(src0, input, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail); write_imageui(output, coord, dst); } @@ -102,8 +102,8 @@ __kernel void floordiv_U8U8toU8( int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); uint4 src0, src1; float4 in0, in1, out; - readImage2DArray(src0, input, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEUI_2DARRAY(src0, input, coord); + READ_IMAGEUI_2DARRAY(src1, input1, coord); in0 = convert_float4(src0) * input0Scale + input0Tail; in1 = convert_float4(src1) * input1Scale + input1Tail; out = floor(in0 / in1) * outputScale + outputTail; @@ 
-148,8 +148,8 @@ __kernel void floordiv_U8I32toU8( uint4 src0; int4 src1; float4 in0, in1, out; - readImage2DArray(src0, input, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEUI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); in0 = convert_float4(src0) * input0Scale + input0Tail; in1 = convert_float4(src1); out = floor(in0 / in1) * outputScale + outputTail; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/logical_ops.cl b/src/tim/vx/internal/src/libnnext/ops/cl/logical_ops.cl index 0b0ebf5..3c5806e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/logical_ops.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/logical_ops.cl @@ -7,8 +7,8 @@ __kernel void logical_##name##_I8toI8( \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ int4 src0; \ int4 src1; \ - readImage2DArray(src0, input, coord); \ - readImage2DArray(src1, input1, coord); \ + READ_IMAGEI_2DARRAY(src0, input, coord); \ + READ_IMAGEI_2DARRAY(src1, input1, coord); \ int4 dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \ dst.x = dst.x & 1; \ write_imagei(output, coord, dst); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl index eb59e6d..bbf45c2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl @@ -15,8 +15,8 @@ __kernel void maximum_FP32FP32toFP32 float4 src0; float4 src1; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEF_2DARRAY(src0, input0, coord); + READ_IMAGEF_2DARRAY(src1, input1, coord); float4 dst = src0 > src1 ? src0 : src1; @@ -63,8 +63,8 @@ __kernel void maximum_U8U8toU8 uint4 src0; uint4 src1; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEUI_2DARRAY(src0, input0, coord); + READ_IMAGEUI_2DARRAY(src1, input1, coord); float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; @@ -118,8 +118,8 @@ __kernel void maximum_I32I32toI32 int4 src0; int4 src1; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEI_2DARRAY(src0, input0, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); int4 dst = src0 > src1 ? src0 : src1; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl index d04431a..981d789 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl @@ -15,8 +15,8 @@ __kernel void minimum_FP32FP32toFP32 float4 src0; float4 src1; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEF_2DARRAY(src0, input0, coord); + READ_IMAGEF_2DARRAY(src1, input1, coord); float4 dst = src0 < src1 ? src0 : src1; @@ -63,8 +63,8 @@ __kernel void minimum_U8U8toU8 uint4 src0; uint4 src1; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEUI_2DARRAY(src0, input0, coord); + READ_IMAGEUI_2DARRAY(src1, input1, coord); float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; @@ -118,8 +118,8 @@ __kernel void minimum_I32I32toI32 int4 src0; int4 src1; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEI_2DARRAY(src0, input0, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); int4 dst = src0 < src1 ? 
src0 : src1; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl b/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl index 9acbe98..a2ee944 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl @@ -9,8 +9,8 @@ __kernel void pow_FP32FP32toFP32 float4 src0, src1; float4 dst; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEF_2DARRAY(src0, input0, coord); + READ_IMAGEF_2DARRAY(src1, input1, coord); float4 s0 = sign(src0); int4 t0 = convert_int4(src1) & 1; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/prelu.cl b/src/tim/vx/internal/src/libnnext/ops/cl/prelu.cl index daf2819..b6a5efc 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/prelu.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/prelu.cl @@ -15,8 +15,8 @@ __kernel void prelu_FP32FP32toFP32 float4 src0; float4 src1; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEF_2DARRAY(src0, input0, coord); + READ_IMAGEF_2DARRAY(src1, input1, coord); float4 maxData = src0 >= 0 ? src0 : 0; float4 minData = src0 < 0 ? src0 : 0; @@ -67,8 +67,8 @@ __kernel void prelu_U8U8toU8 uint4 src0; uint4 src1; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEUI_2DARRAY(src0, input0, coord); + READ_IMAGEUI_2DARRAY(src1, input1, coord); float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; @@ -130,8 +130,8 @@ __kernel void prelu_I32I32toI32 int4 src0; int4 src1; - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEI_2DARRAY(src0, input0, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/random_multinomial.cl b/src/tim/vx/internal/src/libnnext/ops/cl/random_multinomial.cl index 2a90f35..88377c9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/random_multinomial.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/random_multinomial.cl @@ -1,14 +1,5 @@ #pragma OPENCL EXTENSION CL_VIV_asm : enable -inline uchar* get_image2D_array_ptr(image2d_array_t input) -{ - int8 desc; - _viv_asm(COPY, desc, input, sizeof(desc)); - uchar *src_ptr = (uchar*)desc.s0; - - return src_ptr; -} - uint4 _philox4x32bumpkey(uint4 key) { uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0); @@ -61,14 +52,16 @@ __kernel void random_seed( float re_rand_max ) { - __global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds); + Tensor s_tensor = create_tensor_from_image2d_array(seeds, 4); + __global uint* seeds_ptr = (__global uint*)s_tensor.ptr; seeds_ptr = seeds_ptr; uint4 key = vload4(0, seeds_ptr); uint4 ctr = (uint4)(0); float4 result = 0; - __global float* output_ptr = (__global float*)get_image2D_array_ptr(output); + Tensor o_tensor = create_tensor_from_image2d_array(output, 4); + __global float* output_ptr = (__global float*)o_tensor.ptr; for(int i = 0; i < iter; i++) { @@ -152,17 +145,20 @@ __kernel void random_multinomial int class_size = get_image_width(cdfs); int offset = gidy * class_size; - __global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs); + Tensor cdf_tensor = create_tensor_from_image2d_array(cdfs, 4); + __global float* cdf_ptr = (__global uint*)cdf_tensor.ptr; __global float* cdfPtr = cdf_ptr + offset; int width = get_image_width(randoms); 
offset = coord.x + coord.y * width; - __global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms); + Tensor r_tensor = create_tensor_from_image2d_array(randoms, 4); + __global float* randoms_ptr = (__global float*)r_tensor.ptr; randoms_ptr = randoms_ptr + offset; width = get_image_width(output); offset = coord.x + coord.y * width; - __global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output); + Tensor o_tensor = create_tensor_from_image2d_array(output, 4); + __global uint* output_ptr = (__global uint*)o_tensor.ptr; output_ptr = output_ptr + offset; float4 ran = vload4(0, randoms_ptr); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/relational_ops.cl b/src/tim/vx/internal/src/libnnext/ops/cl/relational_ops.cl index cd13f7e..6e23b91 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/relational_ops.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/relational_ops.cl @@ -15,8 +15,8 @@ __kernel void func_name##_F32F32toBOOL8 \ \ float4 src0; \ float4 src1; \ - readImage2DArray(src0, input0, coord); \ - readImage2DArray(src1, input1, coord); \ + READ_IMAGEF_2DARRAY(src0, input0, coord); \ + READ_IMAGEF_2DARRAY(src1, input1, coord); \ \ int4 dst = (src0)comp_op(src1); \ dst &= 1; \ @@ -75,8 +75,8 @@ __kernel void func_name##_U32U32toBOOL8 \ \ uint4 data0; \ uint4 data1; \ - readImage2DArray(data0, input0, coord); \ - readImage2DArray(data1, input1, coord); \ + READ_IMAGEUI_2DARRAY(data0, input0, coord); \ + READ_IMAGEUI_2DARRAY(data1, input1, coord); \ \ float4 src0 = convert_float4(data0) * input0Scale - input0Tail; \ float4 src1 = convert_float4(data1) * input1Scale - input1Tail; \ @@ -139,8 +139,8 @@ __kernel void func_name##_I32I32toBOOL8 \ \ int4 src0; \ int4 src1; \ - readImage2DArray(src0, input0, coord); \ - readImage2DArray(src1, input1, coord); \ + READ_IMAGEI_2DARRAY(src0, input0, coord); \ + READ_IMAGEI_2DARRAY(src1, input1, coord); \ \ int4 dst = (src0)comp_op(src1); \ dst &= 1; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update.cl new file mode 100644 index 0000000..93ce3a2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update.cl @@ -0,0 +1,45 @@ + +#define SCATTER_ND_UPDATE(src0_type, data_type, read_func, write_func) \ +__kernel void scatter_nd_update_##src0_type##src0_type##to##src0_type( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __read_only image2d_t input2, \ + __write_only image2d_t output, \ + int offsetX, \ + int offsetY, \ + int offsetZ, \ + int offsetW, \ + int offset_idx, \ + int coord_dim, \ + int index_num \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int cnt = 0; \ + \ + data_type sum = (data_type)(0, 0, 0, 0); \ + Image img1 = create_image_from_image2d(input1, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + for(int i = 0; i < index_num; i++) \ + { \ + int4 indice = vload4(0, index_ptr + offset_idx); \ + index_ptr += coord_dim; \ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \ + if(gidy == idx) \ + { \ + data_type data = read_func(input2, (int2)(gidx, i)); \ + cnt++; \ + sum += data; \ + } \ + } \ + int2 coord = (int2)(gidx, gidy); \ + if(cnt == 0) \ + { \ + sum = read_func(input0, coord); \ + } \ + write_func(output, coord, sum); \ +} +SCATTER_ND_UPDATE(U32, uint4, read_imageui, write_imageui) +SCATTER_ND_UPDATE(I32, int4, read_imagei, write_imagei) +SCATTER_ND_UPDATE(F32, float4, read_imagef, 
write_imagef) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/select.cl b/src/tim/vx/internal/src/libnnext/ops/cl/select.cl index fcdd616..ab39f63 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/select.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/select.cl @@ -12,9 +12,9 @@ __kernel void select_I8_U8_U8toU8( int4 value; uint4 src0, src1, src, dst; float inputScale, inputTail; - readImage2DArray(value, condition, coord); - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEI_2DARRAY(value, condition, coord); + READ_IMAGEF_2DARRAY(src0, input0, coord); + READ_IMAGEF_2DARRAY(src1, input1, coord); src = (value != 0 ? src0 : src1); inputScale = (value.x != 0 ? input0Scale : input1Scale); inputTail = (value.x != 0 ? input0Tail : input1Tail); @@ -56,9 +56,9 @@ __kernel void select_I8_I32_I32toI32( int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 value; int4 src0, src1, dst; - readImage2DArray(value, condition, coord); - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEI_2DARRAY(value, condition, coord); + READ_IMAGEI_2DARRAY(src0, input0, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); dst = (value != 0 ? src0 : src1); write_imagei(output, coord, dst); } @@ -94,9 +94,9 @@ __kernel void select_I8_F32_F32toF32( int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 value; float4 src0, src1, dst; - readImage2DArray(value, condition, coord); - readImage2DArray(src0, input0, coord); - readImage2DArray(src1, input1, coord); + READ_IMAGEI_2DARRAY(value, condition, coord); + READ_IMAGEF_2DARRAY(src0, input0, coord); + READ_IMAGEF_2DARRAY(src1, input1, coord); dst = (value != 0 ? src0 : src1); write_imagef(output, coord, dst); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/signal_frame.cl b/src/tim/vx/internal/src/libnnext/ops/cl/signal_frame.cl new file mode 100644 index 0000000..5976598 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/signal_frame.cl @@ -0,0 +1,21 @@ + +#define SIGNAL_FRAME_SH_IMPL(type, data_type, read_imagefunc, write_imagefunc) \ +__kernel void signal_frame_##type##to##type \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_array_t output, \ + int frame_step \ + ) \ +{ \ + int inner = get_global_id(0); \ + int length_k = get_global_id(1); \ + int frames_id = get_global_id(2); \ + \ + int4 coord = (int4)(inner, length_k, frames_id, frames_id); \ + int2 coord_in = (int2)(inner, frames_id * frame_step + length_k); \ + \ + data_type src = read_imagefunc(input, coord_in); \ + write_imagefunc(output, coord, src); \ +} +SIGNAL_FRAME_SH_IMPL(F32, float4, read_imagef, write_imagef) +SIGNAL_FRAME_SH_IMPL(U8, uint4, read_imageui, write_imageui) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl index f2c281a..117d6d2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl @@ -1,5 +1,5 @@ -#define TILE_3D(name0, name1, data_type, write_image_func) \ +#define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \ __kernel void tile_##name0##to##name1 \ ( \ __read_only image2d_array_t input, \ @@ -19,7 +19,7 @@ __kernel void tile_##name0##to##name1 \ int height = get_image_height(input); \ \ data_type src; \ - readImage2DArray(src, input, coord); \ + read_image_func(src, input, coord); \ \ int batch_id = (short)coord.z / (short)depthIn; \ coord.z = (short)coord.z % 
(short)depthIn; \ @@ -46,11 +46,11 @@ __kernel void tile_##name0##to##name1 \ } \ } \ } -TILE_3D(I32, I32, int4, write_imagei) -TILE_3D(U32, U32, uint4, write_imageui) -TILE_3D(F32, F32, float4, write_imagef) +TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei) +TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui) +TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef) -#define TILE_2D(name0, name1, data_type) \ +#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \ __kernel void tile_##name0##to##name1##_2D \ ( \ __read_only image2d_t input, \ @@ -70,23 +70,22 @@ __kernel void tile_##name0##to##name1##_2D \ int output_width = get_image_width(output); \ int output_height = get_image_height(output); \ \ - data_type src; \ - readImage(src, input, coord); \ + data_type src = read_image_func(input, coord); \ \ do \ { \ do \ { \ - writeImage(output, coord, src); \ + write_image_func(output, coord, src); \ coord.x += width; \ } while (coord.x < output_width); \ coord.x = get_global_id(0); \ coord.y += height; \ } while (coord.y < output_height); \ } -TILE_2D(I32, I32, int4) -TILE_2D(U32, U32, uint4) -TILE_2D(F32, F32, float4) +TILE_2D(I32, I32, int4, read_imagei, write_imagei) +TILE_2D(U32, U32, uint4, read_imageui, write_imageui) +TILE_2D(F32, F32, float4, read_imagef, write_imagef) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c deleted file mode 100644 index f4b6949..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c +++ /dev/null @@ -1,511 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "utils/vsi_nn_link_list.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_BOX_WITH_NMS_LIMIT) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_BOX_WITH_NMS_LIMIT) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_BOX_WITH_NMS_LIMIT) -#define _VX_KERNEL_FUNC_KERNEL (vxBox_with_nms_limitKernel) - -static float hard_nms_kernel - ( - float iou, - float iouThreshold - ) -{ - return iou < iouThreshold ? 1.0f : 0.0f; -} - -static float linear_nms_kernel - ( - float iou, - float iouThreshold - ) -{ - return iou < iouThreshold ? 1.0f : 1.0f - iou; -} - -static float gaussian_nms_kernel - ( - float iou, - float sigma - ) -{ - return (float)(exp(-1.0f * iou * iou / sigma)); -} - -void swap_element - ( - uint32_t* list, - uint32_t first, - uint32_t second - ) -{ - uint32_t temp = list[first]; - list[first] = list[second]; - list[second] = temp; -} - -uint32_t max_element - ( - float* data, - uint32_t* index_list, - uint32_t len - ) -{ - uint32_t i; - uint32_t max_index = 0; - float max_val = data[index_list[0]]; - for(i = 1; i < len; i++) - { - float val = data[index_list[i]]; - if (max_val < val) - { - max_val = val; - max_index = i; - } - } - return max_index; -} - -static uint32_t max_comp_func - ( - void* data, - int32_t left, - int32_t right - ) -{ - float* fdata = (float*)data; - return fdata[left] >= fdata[right]; -} - -void sort_element_by_score - ( - float* data, - uint32_t* index_list, - uint32_t len - ) -{ - vsi_nn_partition(data, 0, len - 1, max_comp_func, TRUE, index_list); -} - -typedef struct -{ - float* fdata; - uint32_t numClasses; -} class_comp_param; - -static uint32_t class_comp_func - ( - void* data, - int32_t left, - int32_t right - ) -{ - class_comp_param *p = (class_comp_param*)data; - float* fdata = p->fdata; - uint32_t numClasses = p->numClasses; - uint32_t lhsClass = left % numClasses, rhsClass = right % numClasses; - return lhsClass == rhsClass ? fdata[left] > fdata[right] - : lhsClass < rhsClass; -} - -static void sort_element_by_class - ( - float* data, - uint32_t* index_list, - uint32_t len, - uint32_t numClasses - ) -{ - class_comp_param class_comp; - class_comp.fdata = data; - class_comp.numClasses = numClasses; - vsi_nn_partition(&class_comp, 0, len - 1, class_comp_func, TRUE, index_list); -} - -// Taking two indices of bounding boxes, return the intersection-of-union. 
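The removed CPU reference scores candidates with an axis-aligned IoU and one of the three NMS kernels defined above (hard, linear, gaussian). A small self-contained check of those formulas on two made-up unit boxes offset by half a width; the threshold and sigma values are illustrative only:

#include <math.h>
#include <stdio.h>

int main(void)
{
    float iou = 0.5f / 1.5f;                        /* intersection 0.5, union 1.5 */
    float hard = iou < 0.3f ? 1.0f : 0.0f;          /* 0.0: candidate is dropped   */
    float linear = iou < 0.3f ? 1.0f : 1.0f - iou;  /* ~0.667: score attenuated    */
    float gauss = expf(-iou * iou / 0.5f);          /* ~0.801 with sigma = 0.5     */
    printf("iou=%f hard=%f linear=%f gauss=%f\n", iou, hard, linear, gauss);
    return 0;
}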
-float getIoUAxisAligned - ( - const float* roi1, - const float* roi2 - ) -{ - const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]); - const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]); - const float x1 = vsi_nn_max(roi1[0], roi2[0]); - const float x2 = vsi_nn_min(roi1[2], roi2[2]); - const float y1 = vsi_nn_max(roi1[1], roi2[1]); - const float y2 = vsi_nn_min(roi1[3], roi2[3]); - const float w = vsi_nn_max(x2 - x1, 0.0f); - const float h = vsi_nn_max(y2 - y1, 0.0f); - const float areaIntersect = w * h; - const float areaUnion = area1 + area2 - areaIntersect; - return areaIntersect / areaUnion; -} - -static vsi_status VX_CALLBACK vxBox_with_nms_limitKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (5) -#define TENSOR_NUM_INPUT (3) -#define TENSOR_NUM_OUTPUT (4) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - int32_t* int32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - float scoreThreshold; - int32_t maxNumDetections; - int32_t nms_kernel_method; - float iou_threshold; - float sigma; - float nms_score_threshold; - - uint32_t i = 0; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - if (i == 2) - { - int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context, - input[i], &in_attr[i]); - } - else - { - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - if (i < 2) - { - f32_out_buffer[i] = (float *)malloc(out_elements[i] * sizeof(float)); - memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); - } - else - { - int32_out_buffer[i] = (int32_t *)malloc(out_elements[i] * sizeof(int32_t)); - memset(int32_out_buffer[i], 0, out_elements[i] * sizeof(int32_t)); - } - } - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(scoreThreshold), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(maxNumDetections), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(nms_kernel_method), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(iou_threshold), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - 
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(sigma), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(nms_score_threshold), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - /* TODO: Add CPU kernel implement */ - { - uint32_t j, n, b, c; - const uint32_t kRoiDim = 4; - uint32_t numRois = in_attr[0].size[1]; - uint32_t numClasses = in_attr[0].size[0]; - int32_t ind; - - uint32_t * batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t)); - int32_t numBatch = 0; - uint32_t * select = NULL; - uint32_t select_size = 0; - uint32_t scores_index = 0; - uint32_t roi_index = 0; - uint32_t roi_out_index = 0; - - memset(batch_data, 0, numRois * sizeof(uint32_t)); - for (i = 0, ind = -1; i < numRois; i++) - { - if (int32_in_buffer[2][i] != ind) - { - ind = int32_in_buffer[2][i]; - numBatch++; - } - batch_data[numBatch - 1]++; - } - select = (uint32_t*)malloc(numBatch * numRois - * numClasses * sizeof(uint32_t)); - memset(select, 0, numBatch * numRois * numClasses * sizeof(uint32_t)); - for (n = 0; n < (uint32_t)numBatch; n++) - { - int32_t numDetections_batch = 0; - uint32_t select_start_batch = select_size; - uint32_t select_len = 0; - // Exclude class 0 (background) - for (c = 1; c < numClasses; c++) - { - uint32_t select_start = select_size; - int32_t maxNumDetections0 = maxNumDetections; - uint32_t numDetections = 0; - for (b = 0; b < batch_data[n]; b++) - { - uint32_t index = b * numClasses + c; - float score = f32_in_buffer[0][scores_index + index]; - if (score > scoreThreshold) { - select[select_size] = index; - select_size++; - } - } - select_len = select_size - select_start; - - if (maxNumDetections0 < 0) - { - maxNumDetections0 = select_len; - } - - for (j = 0; (j < select_len && numDetections < (uint32_t)maxNumDetections0); j++) - { - // find max score and swap to the front. - int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]), - &(select[select_start + j]), select_len - j) + j; - - swap_element(&(select[select_start]), max_index, j); - - // Calculate IoU of the rest, swap to the end (disgard) if needed. - for (i = j + 1; i < select_len; i++) - { - int32_t roiBase0 = roi_index + select[select_start + i] * kRoiDim; - int32_t roiBase1 = roi_index + select[select_start + j] * kRoiDim; - float iou = getIoUAxisAligned(&(f32_in_buffer[1][roiBase0]), - &(f32_in_buffer[1][roiBase1])); - float kernel_iou; - if (nms_kernel_method == 0) - { - kernel_iou = hard_nms_kernel(iou, iou_threshold); - } - else if (nms_kernel_method == 1) - { - kernel_iou = linear_nms_kernel(iou, iou_threshold); - } - else - { - kernel_iou = gaussian_nms_kernel(iou, sigma); - - } - f32_in_buffer[0][scores_index + select[select_start + i]] *= kernel_iou; - if (f32_in_buffer[0][scores_index + select[select_start + i]] < nms_score_threshold) - { - swap_element(&(select[select_start]), i, select_len - 1); - i--; - select_len--; - } - } - numDetections++; - } - select_size = select_start + select_len; - numDetections_batch += numDetections; - } - - // Take top maxNumDetections. - sort_element_by_score(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]), - numDetections_batch); - - if (numDetections_batch > maxNumDetections) - { - select_size = select_start_batch + maxNumDetections; - } - select_len = select_size - select_start_batch; - // Sort again by class. 
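After the per-batch sort by score and the maxNumDetections clamp above, the removed reference re-orders the surviving detections into class-major order. A minimal sketch of that ordering, mirroring class_comp_func above; cmp_class_major is an illustrative name and abstracts away the index-to-class mapping done with numClasses in the removed code:

/* Returns non-zero when detection A should precede detection B:
   ascending class id, then descending score within the same class. */
static int cmp_class_major(int cls_a, float score_a, int cls_b, float score_b)
{
    if (cls_a != cls_b)
    {
        return cls_a < cls_b;
    }
    return score_a > score_b;
}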
- sort_element_by_class(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]), - select_len, numClasses); - - for (i = 0; i < select_len; i++) - { - int32_t in_index0 = scores_index + select[select_start_batch + i]; - int32_t in_index1 = roi_index + select[select_start_batch + i] * kRoiDim; - f32_out_buffer[0][roi_out_index] = f32_in_buffer[0][in_index0]; - memcpy(&(f32_out_buffer[1][roi_out_index * kRoiDim]), - &f32_in_buffer[1][in_index1], kRoiDim * sizeof(float)); - int32_out_buffer[2][roi_out_index] = select[select_start_batch + i] % numClasses; - int32_out_buffer[3][roi_out_index] = n; - roi_out_index++; - } - - scores_index += batch_data[n] * numClasses; - roi_index += batch_data[n] * numClasses * kRoiDim; - } - if (batch_data) free(batch_data); - if (select) free(select); - } - - /* save data */ - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if (i < 2) - { - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[i], &out_attr[i], f32_out_buffer[i], - out_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - else - { - vsi_nn_vxCopyDataToTensor(context, output[i], &out_attr[i], - (uint8_t *)int32_out_buffer[i]); - } - } - -final: - for (i = 0; i < TENSOR_NUM_INPUT; i++) - { - if (f32_in_buffer[i]) free(f32_in_buffer[i]); - if (int32_in_buffer[i]) free(int32_in_buffer[i]); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if (f32_out_buffer[i]) free(f32_out_buffer[i]); - if (int32_out_buffer[i]) free(int32_out_buffer[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t vxBox_with_nms_limitKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxBox_with_nms_limitInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxBox_with_nms_limit_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxBox_with_nms_limitKernelParam, - _cnt_of_array( vxBox_with_nms_limitKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxBox_with_nms_limit_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxBox_with_nms_limitKernelParam, - _cnt_of_array( vxBox_with_nms_limitKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxBox_with_nms_limitInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_BOX_WITH_NMS_LIMIT_list[] = -{ - &vxBox_with_nms_limit_CPU, - &vxBox_with_nms_limit_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c 
b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c deleted file mode 100644 index f14c2f6..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c +++ /dev/null @@ -1,250 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_EXTRA_ENDING) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_EXTRA_ENDING) -#define _VX_KERNEL_FUNC_KERNEL (vxExtra_endingKernel) - -static vsi_status VX_CALLBACK vxExtra_endingKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define TENSOR_NUM_INPUT (2) -#define TENSOR_NUM_OUTPUT (1) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor input = NULL; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - uint8_t *u8_in_buffer[1] = {0}; - uint8_t *u8_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - vsi_nn_tensor_attr_t in_attr; - - int32_t i = 0; - - memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - input = (vx_tensor)paramObj[1]; - status = vsi_nn_vxGetTensorAttr(input, &in_attr); - TEST_CHECK_STATUS(status, final); - - for(i = 0; i < 1; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - u8_out_buffer[i]= (uint8_t *)malloc(out_elements[i] * sizeof(uint8_t)); - memset(u8_out_buffer[i], 0, out_elements[i] * sizeof(uint8_t)); - - u8_in_buffer[0] = vsi_nn_vxCopyTensorToData(context, input, &in_attr); - memcpy(u8_out_buffer[0], u8_in_buffer[0], out_elements[i] * sizeof(uint8_t)); - } - - /* save data */ - status = 
vsi_nn_vxCopyDataToTensor(context, output[0], &out_attr[0], u8_out_buffer[0]); - TEST_CHECK_STATUS(status, final); - -final: - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if (u8_out_buffer[i]) free(u8_out_buffer[i]); - } - if (u8_in_buffer[0]) free(u8_in_buffer[0]); - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t vxExtra_endingKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxExtra_endingInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; -// Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_tensor output = (vx_tensor)paramObj[2]; - - vx_uint32 width = 0; - vx_uint32 height = 0; - vx_uint32 channel = 0; - vx_uint32 dst_size[4] = {1, 1, 1, 1}; - vsi_nn_tensor_attr_t attr; - uint32_t i; - uint32_t output_dims; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(output, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - output_dims = attr.dim_num; - for (i = 0; i < output_dims; i++) - { - dst_size[i] = attr.size[i]; - } - - width = dst_size[0]; - height = dst_size[1]; - channel = dst_size[2]; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkOffset[2] = 0; - shaderParam.globalWorkScale[0] = 8; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = 1; - shaderParam.localWorkSize[0] = 16; - shaderParam.localWorkSize[1] = 1; - shaderParam.localWorkSize[2] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((width + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - shaderParam.globalWorkSize[1] = gcmALIGN((height + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - shaderParam.globalWorkSize[2] = channel; - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - if(status < 0) - VSILOGE("error-%s,%d\n",__FILE__,__LINE__); - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxExtra_ending_CPU = -{ - _VX_KERNEL_ID, - VX_KERNEL_NAME_EXTRA_ENDING_I16, - _VX_KERNEL_FUNC_KERNEL, - vxExtra_endingKernelParam, - _cnt_of_array( vxExtra_endingKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxExtra_ending_i16 = -{ - _VX_KERNEL_ID, - VX_KERNEL_NAME_EXTRA_ENDING_I16, - NULL, - vxExtra_endingKernelParam, - _cnt_of_array( vxExtra_endingKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxExtra_endingInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxExtra_ending_i8 = -{ - _VX_KERNEL_ID, - VX_KERNEL_NAME_EXTRA_ENDING_I8, - NULL, - vxExtra_endingKernelParam, - _cnt_of_array( 
vxExtra_endingKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxExtra_endingInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxExtra_ending_u8 = -{ - _VX_KERNEL_ID, - VX_KERNEL_NAME_EXTRA_ENDING_U8, - NULL, - vxExtra_endingKernelParam, - _cnt_of_array( vxExtra_endingKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxExtra_endingInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_EXTRA_ENDING_list[] = -{ - &vxExtra_ending_CPU, - &vxExtra_ending_i16, - &vxExtra_ending_i8, - &vxExtra_ending_u8, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c deleted file mode 100644 index e464197..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c +++ /dev/null @@ -1,322 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ -#include -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_HEATMAP_MAX_KEYPOINT) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_HEATMAP_MAX_KEYPOINT) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_HEATMAP_MAX_KEYPOINT) -#define _VX_KERNEL_FUNC_KERNEL (vxHeatmap_max_keypointKernel) - -// This function uses Taylor expansion up to the quatratic term to approximate bicubic -// upscaling result. 
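/* Editor's note (illustrative, not from the original source): solveForDelta() below
 * fits a quadratic to the 3x3 grid of scores around the arg-max and returns the
 * sub-cell offset `delta` plus the refined score `deltaScore`. Worked example with
 * corners = 0.7, centre row {0.5, 1.0, 0.9} and centre column {0.8, 1.0, 0.8}:
 *     b     = (-0.2, 0.0)
 *     A     = [[-0.6, 0.0], [0.0, -0.4]],  detA = 0.24
 *     delta      = inv(A) * b = (1/3, 0)
 *     deltaScore = 1.0 + 0.2/3 - 0.6 * (1/9) / 2  ~= 1.033
 * i.e. the keypoint moves a third of a heatmap cell towards the higher-scoring
 * neighbour and its score is refined slightly upwards. */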
-// 2nd order Taylor expansion: D(x) = D - b'x + 1/2 * x'Ax -// where D = grid[1][1], Taylor expansion center, the original score, -// x = delta, the correction on max keypoint position, -// D(x) = deltaScore, the accuracy score after correction -static void solveForDelta - ( - const float grid[3][3], - float* delta, - float* deltaScore, - float fpAtol, - float fpRtol - ) -{ - // b: negative 1st order derivative at center - // A: Hessian matrix at center (2nd order derivative) - float A[2][2], b[2]; - float crossProd1, crossProd2; - float detA; - b[0] = -(grid[1][2] - grid[1][0]) / 2.0f; - b[1] = -(grid[2][1] - grid[0][1]) / 2.0f; - A[0][0] = grid[1][0] - 2.0f * grid[1][1] + grid[1][2]; - A[0][1] = (grid[2][2] - grid[2][0] - grid[0][2] + grid[0][0]) / 4.0f; - A[1][0] = A[0][1]; - A[1][1] = grid[0][1] - 2.0f * grid[1][1] + grid[2][1]; - - // solve Ax=b, where x=delta -> delta = inv(A) * b - crossProd1 = A[0][0] * A[1][1]; - crossProd2 = A[0][1] * A[1][0]; - detA = crossProd1 - crossProd2; - // check if A is invertible - if (fabs(detA) < (fpAtol + fpRtol * crossProd1)) return; - delta[0] = (A[1][1] * b[0] - A[0][1] * b[1]) / detA; - delta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / detA; - - // clip out of range delta, i.e. delta > 3/2 - if (fabs(delta[0]) > 1.5f || fabs(delta[1]) > 1.5f) - { - float scale = (float)(1.5f / vsi_nn_max(fabs(delta[0]), fabs(delta[1]))); - delta[0] *= scale; - delta[1] *= scale; - } - - *deltaScore = grid[1][1] - b[0] * delta[0] - b[1] * delta[1] + - ((A[0][0] * delta[0] + A[0][1] * delta[1]) * delta[0] + - (A[1][0] * delta[0] + A[1][1] * delta[1]) * delta[1]) / - 2.0f; -} - -static vsi_status VX_CALLBACK vxHeatmap_max_keypointKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (1) -#define TENSOR_NUM_INPUT (2) -#define TENSOR_NUM_OUTPUT (2) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - int32_t type; - - uint32_t i = 0; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float)); - memset(f32_out_buffer[i], 0, out_elements[i] * 
sizeof(float)); - } - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(type), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - /* TODO: Add CPU kernel implement */ - { - uint32_t j, k; - uint32_t numBoxes = in_attr[0].size[3]; - uint32_t heatmapSize = in_attr[0].size[2]; - uint32_t numKeypoints = in_attr[0].size[0]; - uint32_t boxInfoLength = 4; - uint32_t output_score_index = 0; - uint32_t output_keypoint_index = 0; - - for(i = 0; i < numBoxes; i++) - { - for (j = 0; j < numKeypoints; j++) - { - uint32_t maxIndex = 0; - float maxScore = -FLT_MAX; - uint32_t maxIndexWidth; - uint32_t maxIndexHeight; - float localGrid[3][3]; - int32_t dh, dw; - float delta[2] = {0.0f, 0.0f}, deltaScore; - float wRoiStart = f32_in_buffer[1][i * boxInfoLength]; - float hRoiStart = f32_in_buffer[1][i * boxInfoLength + 1]; - float wRoiEnd = f32_in_buffer[1][i * boxInfoLength + 2]; - float hRoiEnd = f32_in_buffer[1][i * boxInfoLength + 3]; - float roiWidth = wRoiEnd - wRoiStart; - float roiHeight = hRoiEnd - hRoiStart; - float wRelativePos; - float hRelativePos; - for (k = 0; k < heatmapSize * heatmapSize; k++) - { - uint32_t index = i * heatmapSize * heatmapSize * numKeypoints - + k * numKeypoints + j; - float val = f32_in_buffer[0][index]; - if (maxScore < val) - { - maxScore = val; - maxIndex = k; - } - } - maxIndexWidth = maxIndex % heatmapSize; - maxIndexHeight = maxIndex / heatmapSize; - - // get local 3x3 grid - for (dh = -1; dh <= 1; dh++) - { - for (dw = -1; dw <= 1; dw++) - { - // cast uint32_t to int32_t - int32_t h = (int32_t)(maxIndexHeight) + dh; - int32_t w = (int32_t)(maxIndexWidth) + dw; - uint32_t heatmapIndex; - - // use mirroring for out of bound indexing - // need to ensure heatmapSize >= 2 - h = h < 0 ? 1 : (h >= (int32_t)heatmapSize ? heatmapSize - 2 : h); - w = w < 0 ? 1 : (w >= (int32_t)heatmapSize ? 
heatmapSize - 2 : w); - - heatmapIndex = i * heatmapSize * heatmapSize * numKeypoints + - (uint32_t)(h) * heatmapSize * numKeypoints + - (uint32_t)(w) * numKeypoints + j; - localGrid[dh + 1][dw + 1] = f32_in_buffer[0][heatmapIndex]; - } - } - deltaScore = maxScore; - solveForDelta((const float (*)[3])localGrid, delta, &deltaScore, 1e-3f, 1e-3f); - - wRelativePos = ((float)(maxIndexWidth) + delta[0] + 0.5f) / - (float)(heatmapSize); - hRelativePos = ((float)(maxIndexHeight) + delta[1] + 0.5f) / - (float)(heatmapSize); - f32_out_buffer[0][output_score_index] = deltaScore; - f32_out_buffer[1][output_keypoint_index] = wRelativePos * roiWidth + wRoiStart; - f32_out_buffer[1][output_keypoint_index + 1] = hRelativePos * roiHeight + hRoiStart; - output_score_index++; - output_keypoint_index +=2; - } - } - } - - /* save data */ - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[i], &out_attr[i], f32_out_buffer[i], - out_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - -final: - for (i = 0; i < TENSOR_NUM_INPUT; i++) - { - if (f32_in_buffer[i]) free(f32_in_buffer[i]); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if (f32_out_buffer[i]) free(f32_out_buffer[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t vxHeatmap_max_keypointKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxHeatmap_max_keypointInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxHeatmap_max_keypoint_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxHeatmap_max_keypointKernelParam, - _cnt_of_array( vxHeatmap_max_keypointKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxHeatmap_max_keypoint_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxHeatmap_max_keypointKernelParam, - _cnt_of_array( vxHeatmap_max_keypointKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxHeatmap_max_keypointInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_HEATMAP_MAX_KEYPOINT_list[] = -{ - &vxHeatmap_max_keypoint_CPU, - &vxHeatmap_max_keypoint_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c deleted file mode 100644 index a63cb15..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c +++ /dev/null @@ -1,1177 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the 
Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_IMAGEPROCESS) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_IMAGEPROCESS) -#define _VX_KERNEL_NAME ("vsi_nn_kernel_imageprocess") -#define _VX_KERNEL_FUNC_KERNEL (vximageprocessKernel) - -//static uint32_t layerNum = 0; - -static void resize_crop_op - ( - uint8_t *buffer_ptr0, - uint8_t *buffer_ptr1, - vsi_nn_tensor_attr_t *attr0, - vsi_nn_tensor_attr_t *attr1, - uint32_t *stride_size0, - uint32_t *stride_size1, - int32_t *resize_crop_start - ) -{ - int32_t index[4]; - for (index[3] = 0; index[3] < (int32_t)attr1->size[3]; index[3]++) - { - for (index[2] = 0; index[2] < (int32_t)attr1->size[2]; index[2]++) - { - for (index[1] = 0; index[1] < (int32_t)attr1->size[1]; index[1]++) - { - for (index[0] = 0; index[0] < (int32_t)attr1->size[0]; index[0]++) - { - int32_t index_in = (((index[3] + resize_crop_start[3]) * attr0->size[2] - + (index[2] + resize_crop_start[2])) * attr0->size[1] - + (index[1] + resize_crop_start[1])) * attr0->size[0] - + (index[0] + resize_crop_start[0]); - int32_t index_out = (((index[3]) * attr1->size[2] - + (index[2])) * attr1->size[1] - + (index[1])) * attr1->size[0] - + (index[0]); - float val; - vsi_nn_DtypeToFloat32(&buffer_ptr0[stride_size0[0] * index_in], - &val, &attr0->dtype); - vsi_nn_Float32ToDtype(val, &buffer_ptr1[stride_size1[0] * index_out], - &attr1->dtype); - } - } - } - } -} - -static void reverse_channel_op - ( - uint8_t *buffer_ptr0, - uint8_t *buffer_ptr1, - vsi_nn_tensor_attr_t *attr, - uint32_t *stride_size - ) -{ - int32_t index[4]; - for (index[3] = 0; index[3] < (int32_t)attr->size[3]; index[3]++) - { - for (index[2] = 0; index[2] < 3; index[2]++) - { - for (index[1] = 0; index[1] < (int32_t)attr->size[1]; index[1]++) - { - for (index[0] = 0; index[0] < (int32_t)attr->size[0]; index[0]++) - { - int32_t index_in = (((index[3]) * attr->size[2] - + (2 - index[2])) * attr->size[1] - + (index[1])) * attr->size[0] - + (index[0]); - int32_t index_out = (((index[3]) * attr->size[2] - + (index[2])) * attr->size[1] - + (index[1])) * attr->size[0] - + (index[0]); - float val; - vsi_nn_DtypeToFloat32(&buffer_ptr0[stride_size[0] * index_in], - &val, &attr->dtype); - vsi_nn_Float32ToDtype(val, &buffer_ptr1[stride_size[0] * index_out], - &attr->dtype); - } - } - } - } -} - -static void mean_pixel_op - ( - uint8_t *buffer_ptr0, - uint8_t *buffer_ptr1, - vsi_nn_tensor_attr_t *attr, - uint32_t 
*stride_size, - float mean_scale, - float *mean_mean_value - ) -{ - int32_t index[4]; - for (index[3] = 0; index[3] < (int32_t)attr->size[3]; index[3]++) - { - for (index[2] = 0; index[2] < (int32_t)attr->size[2]; index[2]++) - { - for (index[1] = 0; index[1] < (int32_t)attr->size[1]; index[1]++) - { - for (index[0] = 0; index[0] < (int32_t)attr->size[0]; index[0]++) - { - int32_t index_in = (((index[3]) * attr->size[2] - + (index[2])) * attr->size[1] - + (index[1])) * attr->size[0] - + (index[0]); - int32_t index_out = (((index[3]) * attr->size[2] - + (index[2])) * attr->size[1] - + (index[1])) * attr->size[0] - + (index[0]); - float val; - vsi_nn_DtypeToFloat32(&buffer_ptr0[stride_size[0] * index_in], - &val, &attr->dtype); - val = (val - mean_mean_value[0]) * mean_scale; - vsi_nn_Float32ToDtype(val, &buffer_ptr1[stride_size[0] * index_out], - &attr->dtype); - } - } - } - } -} - -static void mean_channel_op - ( - uint8_t *buffer_ptr0, - uint8_t *buffer_ptr1, - vsi_nn_tensor_attr_t *attr, - uint32_t *stride_size, - float mean_scale, - float *mean_mean_value - ) -{ - int32_t index[4]; - for (index[3] = 0; index[3] < (int32_t)attr->size[3]; index[3]++) - { - for (index[2] = 0; index[2] < (int32_t)attr->size[2]; index[2]++) - { - for (index[1] = 0; index[1] < (int32_t)attr->size[1]; index[1]++) - { - for (index[0] = 0; index[0] < (int32_t)attr->size[0]; index[0]++) - { - int32_t index_in = (((index[3]) * attr->size[2] - + (index[2])) * attr->size[1] - + (index[1])) * attr->size[0] - + (index[0]); - int32_t index_out = (((index[3]) * attr->size[2] - + (index[2])) * attr->size[1] - + (index[1])) * attr->size[0] - + (index[0]); - float val; - vsi_nn_DtypeToFloat32(&buffer_ptr0[stride_size[0] * index_in], - &val, &attr->dtype); - val = (val - mean_mean_value[index[2]]) * mean_scale; - vsi_nn_Float32ToDtype(val, &buffer_ptr1[stride_size[0] * index_out], - &attr->dtype); - } - } - } - } -} - -static vsi_status VX_CALLBACK vximageprocessKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (14) -#define TENSOR_NUM_INPUT (1) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VX_SUCCESS; - int32_t i; - vx_context context = NULL; - vsi_nn_tensor_attr_t attr[TENSOR_NUM]; - uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; - vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; - uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; - vx_tensor tensor[TENSOR_NUM]; - - int32_t crop_enable, resize_crop_dim_num, resize_crop_start[4] = {0}; - int32_t mean_type, mean_mean_value_size; - vx_bool reverse_channel; - float mean_scale, mean_mean_value[4] = {0}; - uint8_t *temp_ptr[2] = {NULL}; - uint32_t buf_sz; - - //prepare data - context = vxGetContext((vx_reference)node); - - for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); - } - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - tensor[i] = (vx_tensor)paramObj[i]; - buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], - &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(crop_enable), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(resize_crop_dim_num), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - for (i = 0; i < resize_crop_dim_num; i++) - { - 
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2 + i], &(resize_crop_start[i]), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - } - - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 6], &(reverse_channel), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 7], &(mean_type), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 8], &(mean_scale), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 9], &(mean_mean_value_size), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - for (i = 0; i < mean_mean_value_size; i++) - { - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 10 + i], &(mean_mean_value[i]), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - } - - //op calc - buf_sz = vsi_nn_GetTensorSize(attr[1].size, attr[1].dim_num, attr[1].dtype.vx_type); - temp_ptr[0] = (uint8_t *)malloc( buf_sz ); - temp_ptr[1] = (uint8_t *)malloc( buf_sz ); - - if (crop_enable == TRUE) - { - resize_crop_op(buffer_ptr[0], temp_ptr[0], &attr[0], &attr[1], - stride_size[0], stride_size[1], resize_crop_start); - } - - if (reverse_channel) - { - reverse_channel_op(temp_ptr[0], temp_ptr[1], &attr[1], - stride_size[1]); - } - - if (mean_type == VSI_NN_IMAGEPROCESS_MEAN_PIXEL) - { - mean_pixel_op(temp_ptr[1], buffer_ptr[1], &attr[1], - stride_size[1], mean_scale, mean_mean_value); - } - else if (mean_type == VSI_NN_IMAGEPROCESS_MEAN_CHANNEL) - { - mean_channel_op(temp_ptr[1], buffer_ptr[1], &attr[1], - stride_size[1], mean_scale, mean_mean_value); - } - - //save data - for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) - { - status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); - if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); - } - for( i = 0; i < TENSOR_NUM; i ++ ) - { - if (buffer_ptr[i]) free(buffer_ptr[i]); - } - - if (temp_ptr[0]) free(temp_ptr[0]); - if (temp_ptr[1]) free(temp_ptr[1]); - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -vx_status VX_CALLBACK vxScaletoTensorInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ -// Alignment with a power of two value. 
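/* Editor's note: the macro defined below rounds n up to the next multiple of a
 * power-of-two alignment, e.g. gcmALIGN(13, 4) == 16, gcmALIGN(16, 4) == 16,
 * gcmALIGN(10, 8) == 16. The shader initializers use it to round the per-dimension
 * thread count up to the work-group size when computing globalWorkSize, so the
 * dispatched grid is always a whole number of local work-groups. */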
-#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_status status = VX_SUCCESS; - vx_image bgrImg = (vx_image)paramObj[0]; - vx_tensor output = (vx_tensor)paramObj[1]; - vx_scalar xRatio_s = (vx_scalar)paramObj[2]; - vx_scalar yRatio_s = (vx_scalar)paramObj[3]; - vx_uint32 width = 0; - vx_uint32 height = 0; - vx_int32 xRatio = 0; - vx_int32 yRatio = 0; - vx_uint32 output_size[DIM_SIZE] = {1, 1, 1, 1}; - vx_int8 dstFixedPointPos = 0; - vx_enum dstFormat; - vx_float32 outputScale = 1.0; - vx_int32 output_ZP = 0; - uint32_t output_dims = 0; - vsi_nn_tensor_attr_t attr; - uint32_t i; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - vxQueryImage(bgrImg, VX_IMAGE_WIDTH, &width, sizeof(width)); - vxQueryImage(bgrImg, VX_IMAGE_HEIGHT, &height, sizeof(height)); - - vxCopyScalar(xRatio_s, (void*)&xRatio, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar(yRatio_s, (void*)&yRatio, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - status = vsi_nn_vxGetTensorAttr(output, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - - output_dims = attr.dim_num; - dstFormat = attr.dtype.vx_type; - for (i = 0; i < output_dims; i++) - { - output_size[i] = attr.size[i]; - } - dstFixedPointPos = attr.dtype.fl; - output_ZP = attr.dtype.zero_point; - outputScale = attr.dtype.scale; - - if (xRatio == (1 << 15) && yRatio == (1 << 15)) - { - vx_uint32 uniExtractR_2x8[16] = { - 0x00099999, // TCfg - 0x00044444, // ASelt - 0x09060300, 0x0000000c, // ABin - 0x00099999, // BSelt - 0x06060606, 0x00000006, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniExtractG_2x8[16] = { - 0x00099999, // TCfg - 0x00044444, // ASelt - 0x2a272421, 0x0000002d, // ABin - 0x00099999, // BSelt - 0x06060606, 0x00000006, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniExtractB_2x8[16] = { - 0x00099999, // TCfg - 0x00044444, // ASelt - 0x4b484542, 0x0000004e, // ABin - 0x00099999, // BSelt - 0x06060606, 0x00000006, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - if (dstFormat == VSI_NN_TYPE_FLOAT16 || dstFormat == VSI_NN_TYPE_INT16) - shaderParam.globalWorkScale[0] = 8; - else if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_UINT8) - shaderParam.globalWorkScale[0] = 10; - - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - - if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_INT16) - { - if(dstFixedPointPos > 0) - outputScale = (vx_float32) 
((int64_t)1 << dstFixedPointPos); - else - { - outputScale = 1.0f; - uniExtractR_2x8[7] |= ((-dstFixedPointPos) & 0x1F); - uniExtractG_2x8[7] |= ((-dstFixedPointPos) & 0x1F); - uniExtractB_2x8[7] |= ((-dstFixedPointPos) & 0x1F); - } - } - else if (dstFormat == VSI_NN_TYPE_UINT8) - { - vx_float32 outputZP = (vx_float32)output_ZP; - - outputScale = 1.0f / outputScale; - - vxSetNodeUniform(nodObj, "outputZP", 1, &outputZP); - } - - vxSetNodeUniform(nodObj, "uniExtractR_2x8", 1, uniExtractR_2x8); - vxSetNodeUniform(nodObj, "uniExtractG_2x8", 1, uniExtractG_2x8); - vxSetNodeUniform(nodObj, "uniExtractB_2x8", 1, uniExtractB_2x8); - status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); - } - else - { - vx_uint32 uniVecShift10[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000400, 0x00000000, 0x00000400, 0x00000000, - 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant - }; - vx_uint32 uniAddRShift[16] = { - 0x0f0f0f0f, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002405, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniGetTempVal[16] = { - 0x09090909, // TCfg - 0x00000000, // ASelt - 0x00230001, 0x00670045, // ABin - 0x05050505, // BSelt - 0x00110000, 0x00330022, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniExtractBytes[16] = { - 0x0f0f0f0f, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002414, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniUnpackToR[16] = { - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x09060300, 0x09060300, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00007400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniUnpackToG[16] = { - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x0a070401, 0x0a070401, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00007400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniUnpackToB[16] = { - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x0b080502, 0x0b080502, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00007400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniDataMulAlpha_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x01010101, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniDataSubMean_4x4[16] = { - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00010000, 
0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00007100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, - 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant - }; - vx_uint32 uniConvertIntergetoF32_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }; - vx_uint32 uniExtactInteger_2x8[16] = { - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002300, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 4; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - - status |= vxSetNodeUniform(nodObj, "uniDataMulAlpha_4x4", 1, uniDataMulAlpha_4x4); - status |= vxSetNodeUniform(nodObj, "uniDataSubMean_4x4", 1, uniDataSubMean_4x4); - status |= vxSetNodeUniform(nodObj, "uniUnpackToR", 1, uniUnpackToR); - status |= vxSetNodeUniform(nodObj, "uniUnpackToG", 1, uniUnpackToG); - status |= vxSetNodeUniform(nodObj, "uniUnpackToB", 1, uniUnpackToB); - status |= vxSetNodeUniform(nodObj, "uniVecShift10", 1, uniVecShift10); - status |= vxSetNodeUniform(nodObj, "uniAddRShift", 1, uniAddRShift); - status |= vxSetNodeUniform(nodObj, "uniGetTempVal", 1, uniGetTempVal); - status |= vxSetNodeUniform(nodObj, "uniExtractBytes", 1, uniExtractBytes); - - if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_INT16) - { - if(dstFixedPointPos > 0) - outputScale = (vx_float32) ((int64_t)1 << dstFixedPointPos); - else - outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixedPointPos); - - status |= vxSetNodeUniform(nodObj, "uniConvertIntergetoF32_4x4", - 1, uniConvertIntergetoF32_4x4); - status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); - status |= vxSetNodeUniform(nodObj, "uniExtactInteger_2x8", 1, uniExtactInteger_2x8); - } - else if (dstFormat == VSI_NN_TYPE_UINT8) - { - vx_float32 outputZP = (vx_float32)output_ZP; - - outputScale = 1.0f / outputScale; - - status |= vxSetNodeUniform(nodObj, "uniConvertIntergetoF32_4x4", - 1, uniConvertIntergetoF32_4x4); - status |= vxSetNodeUniform(nodObj, "outputZP", 1, &outputZP); - status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); - status |= vxSetNodeUniform(nodObj, "uniExtactInteger_2x8", 1, uniExtactInteger_2x8); - } - } - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -vx_status VX_CALLBACK vxGrayScaletoTensorInitializer(vx_node nodObj, const vx_reference *paramObj, vx_uint32 paraNum) -{ -// Alignment with a power of two value. 
-#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_status status = VX_SUCCESS; - vx_image inputImg = (vx_image)paramObj[0]; - vx_scalar xRatio_s = (vx_scalar)paramObj[2]; - vx_scalar yRatio_s = (vx_scalar)paramObj[3]; - vx_tensor output = (vx_tensor)paramObj[1]; - vx_uint32 width = 0; - vx_uint32 height = 0; - vx_int32 xRatio = 0; - vx_int32 yRatio = 0; - vx_uint32 output_size[4] = {1, 1, 1, 1}; - vx_int8 dstFixedPointPos = 0; - vx_enum dstFormat; - vx_float32 outputScale = 1.0; - vx_int32 output_ZP = 0; - uint32_t output_dims = 0; - vsi_nn_tensor_attr_t attr; - uint32_t i; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - vxQueryImage(inputImg, VX_IMAGE_WIDTH, &width, sizeof(width)); - vxQueryImage(inputImg, VX_IMAGE_HEIGHT, &height, sizeof(height)); - - vxCopyScalar(xRatio_s, (void*)&xRatio, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar(yRatio_s, (void*)&yRatio, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - status = vsi_nn_vxGetTensorAttr(output, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - - output_dims = attr.dim_num; - dstFormat = attr.dtype.vx_type; - for (i = 0; i < output_dims; i++) - { - output_size[i] = attr.size[i]; - } - dstFixedPointPos = attr.dtype.fl; - output_ZP = attr.dtype.zero_point; - outputScale = attr.dtype.scale; - - if (xRatio == (1 << 15) && yRatio == (1 << 15)) - { - vx_uint32 uniDataMeanStddevLo_2x8[16] = { - 0x99999999, // TCfg - 0x44444444, // ASelt - 0x03020100, 0x07060504, // ABin - 0x99999999, // BSelt - 0x06060606, 0x06060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant - }; - vx_uint32 uniDataMeanStddevHi_2x8[16] = { - 0x99999999, // TCfg - 0x44444444, // ASelt - 0x0b0a0908, 0x0f0e0d0c, // ABin - 0x99999999, // BSelt - 0x06060606, 0x06060606, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, - 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant - }; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - if (dstFormat == VSI_NN_TYPE_FLOAT16 || dstFormat == VSI_NN_TYPE_INT16) - shaderParam.globalWorkScale[0] = 16; - else if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_UINT8) - shaderParam.globalWorkScale[0] = 16; - - shaderParam.globalWorkScale[1] = 1; - shaderParam.localWorkSize[0] = 8; - shaderParam.localWorkSize[1] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + - shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - shaderParam.globalWorkSize[1] = gcmALIGN((output_size[1] + - shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - - if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_INT16) - { - if(dstFixedPointPos > 0) - outputScale = (vx_float32) ((int64_t)1 << dstFixedPointPos); - else - { - outputScale = 1.0f; - uniDataMeanStddevLo_2x8[7] |= ((-dstFixedPointPos) & 0x1F); - uniDataMeanStddevHi_2x8[7] |= ((-dstFixedPointPos) & 
0x1F); - } - } - else if (dstFormat == VSI_NN_TYPE_UINT8) - { - vx_float32 outputZP = (vx_float32)output_ZP; - - outputScale = 1.0f / outputScale; - - vxSetNodeUniform(nodObj, "outputZP", 1, &outputZP); - } - - vxSetNodeUniform(nodObj, "uniDataMeanStddevLo_2x8", 1, uniDataMeanStddevLo_2x8); - vxSetNodeUniform(nodObj, "uniDataMeanStddevHi_2x8", 1, uniDataMeanStddevHi_2x8); - status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); - } - else - { - vx_uint32 uniVecShift10[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000400, 0x00000000, 0x00000400, 0x00000000, - 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant - }; - vx_uint32 uniAddRShift[16] = { - 0x0f0f0f0f, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002405, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniGetTempVal[16] = { - 0x09090909, // TCfg - 0x00000000, // ASelt - 0x00230001, 0x00670045, // ABin - 0x05050505, // BSelt - 0x00110000, 0x00330022, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniExtractBytes[16] = { - 0x0f0f0f0f, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002414, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniDataMulAlpha_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x01010101, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - vx_uint32 uniDataSubMean_4x4[16] = { - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00007100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, - 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant - }; - vx_uint32 uniConvertIntergetoF32_4x4[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }; - vx_uint32 uniExtactInteger_2x8[16] = { - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002300, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 4; - shaderParam.globalWorkScale[1] = 1; - shaderParam.localWorkSize[0] = 2; - shaderParam.localWorkSize[1] = 4; - shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + - shaderParam.globalWorkScale[0] - 1) 
- / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - shaderParam.globalWorkSize[1] = gcmALIGN((output_size[1] + - shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - - if (dstFormat == VSI_NN_TYPE_FLOAT16) - { - status |= vxSetNodeUniform(nodObj, "uniDataMulAlpha_4x4", 1, uniDataMulAlpha_4x4); - status |= vxSetNodeUniform(nodObj, "uniDataSubMean_4x4", 1, uniDataSubMean_4x4); - } - - status |= vxSetNodeUniform(nodObj, "uniVecShift10", 1, uniVecShift10); - status |= vxSetNodeUniform(nodObj, "uniAddRShift", 1, uniAddRShift); - status |= vxSetNodeUniform(nodObj, "uniGetTempVal", 1, uniGetTempVal); - status |= vxSetNodeUniform(nodObj, "uniExtractBytes", 1, uniExtractBytes); - - if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_INT16) - { - if(dstFixedPointPos > 0) - outputScale *= (vx_float32) ((int64_t)1 << dstFixedPointPos); - else - outputScale *= 1.0f / (vx_float32) ((int64_t)1 << -dstFixedPointPos); - - status |= vxSetNodeUniform(nodObj, "uniConvertIntergetoF32_4x4", - 1, uniConvertIntergetoF32_4x4); - status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); - status |= vxSetNodeUniform(nodObj, "uniExtactInteger_2x8", 1, - uniExtactInteger_2x8); - } - else if (dstFormat == VSI_NN_TYPE_UINT8) - { - vx_float32 outputZP = (vx_float32)output_ZP; - - outputScale = 1.0f / outputScale; - - status |= vxSetNodeUniform(nodObj, "uniConvertIntergetoF32_4x4", - 1, uniConvertIntergetoF32_4x4); - status |= vxSetNodeUniform(nodObj, "outputZP", 1, &outputZP); - status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); - status |= vxSetNodeUniform(nodObj, "uniExtactInteger_2x8", 1, - uniExtactInteger_2x8); - } - } - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, -}; - -static vx_param_description_t vxScaletoTensorKernelParam[] = -{ - {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, 
VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -static vx_param_description_t vxGrayScaletoTensorKernelParam[] = -{ - {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -#ifdef __cplusplus -extern "C" { -#endif - -vx_kernel_description_t _VX_KERNEL_VAR = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaletoTensorKernelInfo_fp16 = -{ - VX_KERNEL_ENUM_SCALETOTENSOR, - VX_KERNEL_NAME_SCALETOTENSOR_FP16, - NULL, - vxScaletoTensorKernelParam, - (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaletoTensorKernelInfo_int8 = -{ - VX_KERNEL_ENUM_SCALETOTENSOR, - VX_KERNEL_NAME_SCALETOTENSOR_INT8, - NULL, - vxScaletoTensorKernelParam, - (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaletoTensorKernelInfo_fp16_copy = -{ - VX_KERNEL_ENUM_SCALETOTENSOR, - VX_KERNEL_NAME_SCALETOTENSOR_FP16_COPY, - NULL, - vxScaletoTensorKernelParam, - (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaletoTensorKernelInfo_int8_copy = -{ - VX_KERNEL_ENUM_SCALETOTENSOR, - VX_KERNEL_NAME_SCALETOTENSOR_INT8_COPY, - NULL, - vxScaletoTensorKernelParam, - (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaletoTensorKernelInfo_int16 = -{ - VX_KERNEL_ENUM_SCALETOTENSOR, - VX_KERNEL_NAME_SCALETOTENSOR_INT16, - NULL, - vxScaletoTensorKernelParam, - (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaletoTensorKernelInfo_int16_copy = -{ - VX_KERNEL_ENUM_SCALETOTENSOR, - VX_KERNEL_NAME_SCALETOTENSOR_INT16_COPY, - NULL, - vxScaletoTensorKernelParam, - (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaletoTensorKernelInfo_uint8 = -{ - VX_KERNEL_ENUM_SCALETOTENSOR, - VX_KERNEL_NAME_SCALETOTENSOR_UINT8, - NULL, - vxScaletoTensorKernelParam, - (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxScaletoTensorKernelInfo_uint8_copy = -{ - VX_KERNEL_ENUM_SCALETOTENSOR, - 
VX_KERNEL_NAME_SCALETOTENSOR_UINT8_COPY, - NULL, - vxScaletoTensorKernelParam, - (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGrayScaletoTensorKernelInfo_fp16 = -{ - VX_KERNEL_ENUM_GRAYSCALETOTENSOR, - VX_KERNEL_NAME_GRAYSCALETOTENSOR_FP16, - NULL, - vxGrayScaletoTensorKernelParam, - (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGrayScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGrayScaletoTensorKernelInfo_int8 = -{ - VX_KERNEL_ENUM_GRAYSCALETOTENSOR, - VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT8, - NULL, - vxGrayScaletoTensorKernelParam, - (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGrayScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGrayScaletoTensorKernelInfo_fp16_copy = -{ - VX_KERNEL_ENUM_GRAYSCALETOTENSOR, - VX_KERNEL_NAME_GRAYSCALETOTENSOR_FP16_COPY, - NULL, - vxGrayScaletoTensorKernelParam, - (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGrayScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGrayScaletoTensorKernelInfo_int8_copy = -{ - VX_KERNEL_ENUM_GRAYSCALETOTENSOR, - VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT8_COPY, - NULL, - vxGrayScaletoTensorKernelParam, - (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGrayScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGrayScaletoTensorKernelInfo_int16 = -{ - VX_KERNEL_ENUM_GRAYSCALETOTENSOR, - VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16, - NULL, - vxGrayScaletoTensorKernelParam, - (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGrayScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGrayScaletoTensorKernelInfo_int16_copy = -{ - VX_KERNEL_ENUM_GRAYSCALETOTENSOR, - VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16_COPY, - NULL, - vxGrayScaletoTensorKernelParam, - (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGrayScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGrayScaletoTensorKernelInfo_uint8 = -{ - VX_KERNEL_ENUM_GRAYSCALETOTENSOR, - VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8, - NULL, - vxGrayScaletoTensorKernelParam, - (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGrayScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGrayScaletoTensorKernelInfo_uint8_copy = -{ - VX_KERNEL_ENUM_GRAYSCALETOTENSOR, - VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8_COPY, - NULL, - vxGrayScaletoTensorKernelParam, - (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGrayScaletoTensorInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_IMAGEPROCESS_list[] = -{ - &_VX_KERNEL_VAR, - &vxScaletoTensorKernelInfo_fp16, - &vxScaletoTensorKernelInfo_int8, - 
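/* Editor's note: each vx_kernel_*_list[] in these deleted files follows the same
 * convention: entry 0 is the CPU reference kernel (the only one with a host-side
 * kernel function), the remaining entries are per-format shader variants that share
 * the parameter table but supply their own initializer, and the list is
 * NULL-terminated so the registration code can walk it without an explicit length. */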
&vxScaletoTensorKernelInfo_int16, - &vxScaletoTensorKernelInfo_uint8, - &vxScaletoTensorKernelInfo_fp16_copy, - &vxScaletoTensorKernelInfo_int8_copy, - &vxScaletoTensorKernelInfo_int16_copy, - &vxScaletoTensorKernelInfo_uint8_copy, - &vxGrayScaletoTensorKernelInfo_fp16, - &vxGrayScaletoTensorKernelInfo_int8, - &vxGrayScaletoTensorKernelInfo_int16, - &vxGrayScaletoTensorKernelInfo_uint8, - &vxGrayScaletoTensorKernelInfo_fp16_copy, - &vxGrayScaletoTensorKernelInfo_int8_copy, - &vxGrayScaletoTensorKernelInfo_int16_copy, - &vxGrayScaletoTensorKernelInfo_uint8_copy, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c deleted file mode 100644 index 0cb39a1..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c +++ /dev/null @@ -1,806 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
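/* The deleted kernel source below implements signal framing: sliding windows of
 * frame_len samples taken every step samples, with optional zero padding past the
 * end of the signal when pad_end is set. A minimal, self-contained sketch of the
 * same 1-D framing rule is given here for orientation; frame_signal_1d() is a
 * hypothetical helper written only for illustration, not part of this tree.
 * Frame count: pad_end ? ceil(width / step) : (width - frame_len) / step + 1.
 */
#include <stddef.h>
#include <stdint.h>

static size_t frame_signal_1d(const int16_t *in, size_t width,
                              size_t frame_len, size_t step, int pad_end,
                              int16_t *out /* holds frames * frame_len samples */)
{
    size_t frames = pad_end ? (width + step - 1) / step
                            : (width - frame_len) / step + 1;
    for (size_t f = 0; f < frames; ++f)
    {
        for (size_t k = 0; k < frame_len; ++k)
        {
            size_t src = f * step + k;
            out[f * frame_len + k] = (src < width) ? in[src] : 0; /* zero pad */
        }
    }
    return frames;
}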
-* -*****************************************************************************/ - -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define INPUT_FP16 0 -#define OUTPUT_FP16 0 - -vx_status getFactor(vx_uint32 data, vx_uint32 *factor, vx_uint32 minLimit, vx_uint32 maxLimit, vx_uint32 alignData) -{ - vx_uint32 i = 0; - vx_uint32 maxFactor = alignData - 1; - vx_status status = VX_FAILURE; - - for (i = minLimit; i <= maxLimit; i ++) - { - if (data % i == 0) - { - if (status == VX_FAILURE && data % i == 0) - { - *factor = i; - maxFactor = i; - status = VX_SUCCESS; - continue; - } - else if ((i % alignData) < (maxFactor % alignData)) - { - *factor = i; - maxFactor = i; - status = VX_SUCCESS; - } - } - } - - return status; -} - -void mySignalFrameFunc - ( - void* imgIn, - void* imgOut, - uint32_t input_dim, - uint32_t width, - uint32_t height, - uint32_t channel, - uint32_t batch, - uint32_t frame_len, // window size - uint32_t step, - uint32_t pad_end, - uint32_t pad_val, - uint32_t axis, - uint32_t *dstW, - uint32_t *dstH, - uint32_t *dstC, - uint32_t *dstB - ) -{ - uint8_t* tmpIn = (uint8_t*)imgIn; - uint8_t* tmpOut = (uint8_t*)imgOut; - - uint32_t i,j,k; - uint32_t size = 0; - uint32_t iter = 0; - - if(input_dim == 1) - { - if(axis != 0) - { - VSILOGE("error.\n"); - return; - } - *dstW = frame_len; - //*dstH = (len - frame_len) / step + 1; - *dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1); - *dstC = 1; - *dstB = 1; - - size = (*dstW) * sizeof(int16_t); - iter = pad_end ? width : (width - frame_len + 1); - if(pad_end) - { - int16_t* output = (int16_t*)tmpOut; - int16_t* input = (int16_t*)tmpIn; - uint32_t m = 0; - for(i = 0, j = 0; i < iter; i += step) - { - for(m = i; m < frame_len + i; m++) - { - if(m >= width) - { - output[j] = 0; - } - else - { - output[j] = input[m]; - } - j++; - } - } - } - else - { - for(i = 0, j = 0; i < iter; i += step, j++) - { - memcpy(tmpOut + j * size, tmpIn + i * sizeof(int16_t), size); - } - } - } - else if(input_dim == 2) - { - if(axis == 0) - { - uint8_t* src = tmpIn; - uint8_t* dst = tmpOut; - - *dstH = frame_len; - *dstW = width; - *dstC = pad_end ? ((height + step - 1) / step) : ((height - frame_len) / step + 1); - - *dstB = 1; - - size = width * frame_len * sizeof(int16_t); - iter = pad_end ? (height) : (height - frame_len + 1); - if(pad_end) - { - uint32_t m = 0; - size = width * sizeof(int16_t); - for(i = 0, j = 0; i < iter; i += step) - { - for(m = i; m < frame_len + i; m++) - { - if(m >= height) - { - memset(dst + j * size, 0, size); - } - else - { - memcpy(dst + j * size, src + m * width * sizeof(int16_t), size); - } - j++; - } - } - } - else - { - for(i = 0, j = 0; i < iter; i += step, j++) - { - memcpy(dst + j * size, src + i * width * sizeof(int16_t), size); - } - } - } - else if(axis == 1) - { - *dstW = frame_len; - - //*dstH = (len - frame_len) / step + 1; - *dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1); - - *dstC = height; - *dstB = 1; - - size = (*dstW) * sizeof(int16_t); - iter = pad_end ? 
width : (width - frame_len + 1); - if(pad_end) - { - for(k = 0; k < height; k++) - { - uint8_t* src = tmpIn + k * width * sizeof(int16_t); - uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t); - - int16_t* output = (int16_t*)dst; - int16_t* input = (int16_t*)src; - uint32_t m = 0; - for(i = 0, j = 0; i < iter; i += step) - { - for(m = i; m < frame_len + i; m++) - { - if(m >= width) - { - output[j] = 0; - } - else - { - output[j] = input[m]; - } - j++; - } - } - } - } - else - { - for(k = 0; k < height; k++) - { - uint8_t* src = tmpIn + k * width * sizeof(int16_t); - uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t); - - for(i = 0, j = 0; i < iter; i += step, j++) - { - memcpy(dst + j * size, src + i * sizeof(int16_t), size); - } - } - } - } - } - else if(input_dim == 3) - { - if(axis == 0) - { - uint8_t* src = tmpIn; - uint8_t* dst = tmpOut; - size = width * height * frame_len * sizeof(int16_t); - - *dstW = width; - *dstH = height; - *dstC = frame_len; - *dstB = pad_end ? ((channel + step - 1) / step) :((channel - frame_len) / step + 1); - iter = pad_end ? channel : (channel - frame_len + 1); - if(pad_end) - { - uint32_t m = 0; - size = width * height * sizeof(int16_t); - for(i = 0, j = 0; i < iter; i += step) - { - for(m = i; m < frame_len + i; m++) - { - if(m >= channel) - { - memset(dst + j * size, 0 , size); - } - else - { - memcpy(dst + j * size, src + m * width * height * sizeof(int16_t), size); - } - j++; - } - } - } - else - { - for(i = 0, j = 0; i < iter; i += step, j++) - { - memcpy(dst + j * size, src + i * width * height * sizeof(int16_t), size); - } - } - } - else if(axis == 1) - { - *dstH = frame_len; - *dstW = width; - *dstC = pad_end ? ((height + step - 1) / step) : ((height - frame_len) / step + 1); - *dstB = channel; - - size = width * frame_len * sizeof(int16_t); - iter = pad_end ? (height) : (height - frame_len + 1); - if(pad_end) - { - uint32_t m = 0; - size = width * sizeof(int16_t); - for(k = 0; k < channel; k++) - { - uint8_t* src = tmpIn + k * width * height* sizeof(int16_t); - uint8_t* dst = tmpOut + k * (*dstC) * (*dstW) * (*dstH) * sizeof(int16_t); - - for(i = 0, j = 0; i < iter; i += step) - { - for(m = i; m < frame_len + i; m++) - { - if(m >= height) - { - memset(dst + j * size, 0, size); - } - else - { - memcpy(dst + j * size, src + m * width * sizeof(int16_t), size); - } - j++; - } - } - } - } - else - { - for(k = 0; k < channel; k++) - { - uint8_t* src = tmpIn + k * width * height* sizeof(int16_t); - uint8_t* dst = tmpOut + k * (*dstC) * (*dstW) * (*dstH) * sizeof(int16_t); - - for(i = 0, j = 0; i < iter; i += step, j++) - { - memcpy(dst + j * size, src + i * width * sizeof(int16_t), size); - } - } - } - } - else if(axis == 2) - { - //*dstH = (len - frame_len) / step + 1; - *dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1); - *dstW = frame_len; - *dstC = height; - *dstB = channel; - - size = (*dstW) * sizeof(int16_t); - iter = pad_end ? 
width : (width - frame_len + 1); - - if(pad_end) - { - for(k = 0; k < channel * height; k++) - { - uint8_t* src = tmpIn + k * width * sizeof(int16_t); - uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t); - - int16_t* output = (int16_t*)dst; - int16_t* input = (int16_t*)src; - uint32_t m = 0; - for(i = 0, j = 0; i < iter; i += step) - { - for(m = i; m < frame_len + i; m++) - { - if(m >= width) - { - output[j] = 0; - } - else - { - output[j] = input[m]; - } - j++; - } - } - } - } - else - { - for(k = 0; k < channel * height; k++) - { - uint8_t* src = tmpIn + k * width * sizeof(int16_t); - uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t); - for(i = 0, j = 0; i < iter; i += step, j++) - { - memcpy(dst + j * size, src + i * sizeof(int16_t), size); - } - } - } - } - } - - return; -} - -vsi_status VX_CALLBACK vxSignalFrameKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 7) - { - vx_context context = NULL; - // tensor - vx_tensor imgObj[7] = { NULL }; -#if INPUT_FP16 - int16_t *input = NULL; -#else - uint8_t *input = NULL; -#endif -#if OUTPUT_FP16 - int16_t *output = NULL; -#else - uint8_t *output = NULL; -#endif - - uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}, output_size[DIM_SIZE] = {1, 1, 1, 1}, dst_size[DIM_SIZE] = {1, 1, 1, 1}; - vsi_nn_tensor_attr_t in_attr, out_attr; - - vsi_nn_type_e outputFormat = VSI_NN_TYPE_FLOAT16; - uint32_t input_dims = 0, output_dims = 0, tmpDim = 0; - - vx_scalar scalar[5] = { NULL }; - uint32_t frame_length = 0, step = 0, pad_end = 0, pad = 0, axis = 0, axis0 = 0; - uint32_t i = 0; - - memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); - memset(&out_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(imgObj[0], &in_attr); - status |= vsi_nn_vxGetTensorAttr(imgObj[1], &out_attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - goto OnError; - } - - imgObj[0] = (vx_tensor)paramObj[0]; - imgObj[1] = (vx_tensor)paramObj[1]; //output - scalar[0] = (vx_scalar)paramObj[2]; - scalar[1] = (vx_scalar)paramObj[3]; - scalar[2] = (vx_scalar)paramObj[4]; - scalar[3] = (vx_scalar)paramObj[5]; - scalar[4] = (vx_scalar)paramObj[6]; - context = vxGetContext((vx_reference)node); - if (context == NULL) - { - VSILOGE("vxGetContext failure! 
at line %d\n", __LINE__); - goto OnError; - } - //input - input_dims = in_attr.dim_num; - for (i = 0; i < input_dims; i++) - { - input_size[i] = in_attr.size[i]; - } - - //output - output_dims = out_attr.dim_num; - outputFormat = out_attr.dtype.vx_type; - for (i = 0; i < output_dims; i++) - { - output_size[i] = out_attr.size[i]; - } - - input_size[2] = (input_dims <= 2)?1:input_size[2]; - input_size[3] = (input_dims <= 3)?1:input_size[3]; - - -#if INPUT_FP16 - input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t)); -#else - //input = (uint8_t*)malloc(input_size[0]*input_size[1]*input_size[2]*vsi_nn_GetTypeBytes(inputFormat)); -#endif -#if OUTPUT_FP16 - output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t)); -#else - output = (uint8_t*)malloc(output_size[0]*output_size[1]*output_size[2]*vsi_nn_GetTypeBytes(outputFormat)); -#endif - - input = vsi_nn_vxCopyTensorToData(context, imgObj[0], &in_attr); - - // scalar - status = vxCopyScalar(scalar[0], &frame_length, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(scalar[1], &step, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(scalar[2], &pad_end, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(scalar[3], &pad, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(scalar[4], &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if (status != VX_SUCCESS) - { - VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__); - goto OnError; - } - - // Call C Prototype - if(output_dims == 2) - tmpDim = 1; - else - tmpDim = input_dims; - { - axis0 = input_dims - axis - 1; - } - mySignalFrameFunc(input, output, tmpDim, input_size[0], - input_size[1], input_size[2], input_size[3], - frame_length, step, pad_end, pad, axis0, - &dst_size[0], &dst_size[1], &dst_size[2], &dst_size[3]); - - //output tensor - status = vsi_nn_vxCopyDataToTensor(context, imgObj[1], &out_attr, output); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxCopyDataToTensor failure! at line %d\n", __LINE__); - goto OnError; - } - -OnError: - if(input) free(input); - if(output) free(output); - } - - return status; -} - -vsi_status VX_CALLBACK vxSignalFrameInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - vx_scalar scalar[5]; - vx_tensor input = (vx_tensor)paramObj[0]; - vx_tensor output = (vx_tensor)paramObj[1]; - - uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; - uint32_t input_dims = 0; - uint32_t output_dims = 0; - //vx_uint32 factor = 1; - //vx_uint32 maxWorkGroupSize = 8; - uint32_t frame_length, step, pad_end, pad, axis, axis0; - uint32_t output_channel = 0; - - vx_uint32 i = 0; - vsi_nn_tensor_attr_t attr[2]; - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr[0]); - status |= vsi_nn_vxGetTensorAttr(output, &attr[1]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - - input_dims = attr[0].dim_num; - for (i = 0; i < input_dims; i++) - { - input_size[i] = attr[0].size[i]; - } - output_dims = attr[1].dim_num; - - scalar[0] = (vx_scalar)paramObj[2]; - scalar[1] = (vx_scalar)paramObj[3]; - scalar[2] = (vx_scalar)paramObj[4]; - scalar[3] = (vx_scalar)paramObj[5]; - scalar[4] = (vx_scalar)paramObj[6]; - - status = vxCopyScalar(scalar[0], &frame_length, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(scalar[1], &step, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(scalar[2], &pad_end, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(scalar[3], &pad, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - status |= vxCopyScalar(scalar[4], &axis0, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if (status != VX_SUCCESS) - { - VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__); - return status; - } - - { - if(input_dims == 2 && output_dims == 2) - { - axis = input_dims - axis0 - 2; - } - else - { - axis = input_dims - axis0 - 1; - } - } - - input_size[2] = (input_dims <= 2)?1:input_size[2]; - //input_size[2] = (input_dims == 4)?(input_size[2] * input_size[3]):input_size[2]; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkOffset[2] = 0; - if((output_dims == 2) - || (input_dims == 2 && output_dims == 3 && axis == 1) - || (input_dims == 3 && axis == 2)) - { - shaderParam.globalWorkScale[0] = 1; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = 1; - shaderParam.localWorkSize[0] = 1; - shaderParam.localWorkSize[1] = 1; -#if 0 - if (input_size[1] <= maxWorkGroupSize) - shaderParam.localWorkSize[1] = input_size[1]; - else if (getFactor(input_size[1], &factor, 2, maxWorkGroupSize, 8) == VX_SUCCESS) - shaderParam.localWorkSize[1] = factor; - else - shaderParam.localWorkSize[1] = 1; -#endif - - shaderParam.localWorkSize[2] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((1 + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - - shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - //shaderParam.globalWorkSize[1] = input_size[1]; - shaderParam.globalWorkSize[2] = gcmALIGN((input_size[2] + shaderParam.globalWorkScale[2] - 1) - / shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]); - } - else if((input_dims == 2 && output_dims == 3 && axis == 0) - || (input_dims == 3 && axis == 1)) - { - int height = (pad_end == 0) ? (input_size[1] - frame_length + 1) : (input_size[1]); - shaderParam.globalWorkScale[0] = 8; - shaderParam.globalWorkScale[1] = step; - shaderParam.globalWorkScale[2] = 1; - shaderParam.localWorkSize[0] = 1; - shaderParam.localWorkSize[1] = 1; - shaderParam.localWorkSize[2] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - shaderParam.globalWorkSize[1] = gcmALIGN((height + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - shaderParam.globalWorkSize[2] = gcmALIGN((input_size[2] + shaderParam.globalWorkScale[2] - 1) - / shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]); - - output_channel = (pad_end == 0) ? ((input_size[1] - frame_length) / step + 1) : ((input_size[1] + step - 1) / step); - } - else if(input_dims == 3 && axis == 0) - { - int channel = (pad_end == 0) ? 
(input_size[2] - frame_length + 1) : (input_size[2]); - shaderParam.globalWorkScale[0] = 8; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = step; - shaderParam.localWorkSize[0] = 1; - shaderParam.localWorkSize[1] = 1; - shaderParam.localWorkSize[2] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - shaderParam.globalWorkSize[2] = gcmALIGN((channel + shaderParam.globalWorkScale[2] - 1) - / shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]); - } - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); - } - { - status |= vxSetNodeUniform(nodObj, "input_width", 1, &input_size[0]); - status |= vxSetNodeUniform(nodObj, "input_height", 1, &input_size[1]); - status |= vxSetNodeUniform(nodObj, "input_channel", 1, &input_size[2]); - status |= vxSetNodeUniform(nodObj, "output_channel", 1, &output_channel); - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); - } - } - return status; -} -static vx_param_description_t vxSignalFrameKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxSignalFrameKernelInfo = -{ - VX_KERNEL_ENUM_SIGNALFRAME, - VX_KERNEL_NAME_SIGNALFRAME_WIDTH, - NULL, - vxSignalFrameKernelParam, - (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxSignalFrameInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxSignalFrameKernelInfo_height = -{ - VX_KERNEL_ENUM_SIGNALFRAME, - VX_KERNEL_NAME_SIGNALFRAME_HEIGHT, - NULL, - vxSignalFrameKernelParam, - (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxSignalFrameInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxSignalFrameKernelInfo_channel = -{ - VX_KERNEL_ENUM_SIGNALFRAME, - VX_KERNEL_NAME_SIGNALFRAME_CHANNEL, - NULL, - vxSignalFrameKernelParam, - (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxSignalFrameInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxSignalFrameKernelInfo_8bit = -{ - VX_KERNEL_ENUM_SIGNALFRAME, - VX_KERNEL_NAME_SIGNALFRAME_WIDTH_8BITS, - NULL, - vxSignalFrameKernelParam, - (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxSignalFrameInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxSignalFrameKernelInfo_height_8bit = -{ - VX_KERNEL_ENUM_SIGNALFRAME, - VX_KERNEL_NAME_SIGNALFRAME_HEIGHT_8BITS, - NULL, - vxSignalFrameKernelParam, - (sizeof(vxSignalFrameKernelParam) / 
sizeof(vxSignalFrameKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxSignalFrameInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxSignalFrameKernelInfo_channel_8bit = -{ - VX_KERNEL_ENUM_SIGNALFRAME, - VX_KERNEL_NAME_SIGNALFRAME_CHANNEL_8BITS, - NULL, - vxSignalFrameKernelParam, - (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxSignalFrameInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxSignalFrameKernelInfo_CPU = -{ - VX_KERNEL_ENUM_SIGNALFRAME, - VX_KERNEL_NAME_SIGNALFRAME_WIDTH, - vxSignalFrameKernel, - vxSignalFrameKernelParam, - (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_SIGNALFRAME_list[] = -{ - &vxSignalFrameKernelInfo_CPU, - &vxSignalFrameKernelInfo, - &vxSignalFrameKernelInfo_height, - &vxSignalFrameKernelInfo_channel, - &vxSignalFrameKernelInfo_8bit, - &vxSignalFrameKernelInfo_height_8bit, - &vxSignalFrameKernelInfo_channel_8bit, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c deleted file mode 100644 index ffa26dd..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c +++ /dev/null @@ -1,481 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
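/* The deleted initializers below size the GPU dispatch by rounding
 * ceil(extent / globalWorkScale) up to a multiple of the local work size with the
 * gcmALIGN macro (valid for power-of-two alignments). A small standalone sketch of
 * that computation; the extent/scale/local values are assumed examples, not taken
 * from this code.
 */
#include <stdio.h>

#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))

int main(void)
{
    unsigned extent = 230;                              /* e.g. coord_size[0]    */
    unsigned scale  = 12;                               /* pixels per thread     */
    unsigned local  = 4;                                /* local work-group size */
    unsigned items  = (extent + scale - 1) / scale;     /* ceil(230 / 12) = 20   */
    unsigned global = gcmALIGN(items, local);           /* rounds up to 20       */
    printf("globalWorkSize[0] = %u\n", global);
    return 0;
}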
-* -*****************************************************************************/ -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_SPATIAL_TRANSFORMER) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SPATIAL_TRANSFORMER) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_SPATIAL_TRANSFORMER) -#define _VX_KERNEL_FUNC_KERNEL (vxSpatial_transformerKernel) - - -static vsi_status VX_CALLBACK vxSpatial_transformerKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - /*To do cpu implementation*/ - vsi_status status = VX_SUCCESS; - - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, -}; - -vx_status VX_CALLBACK vxTransform_GemmInputValidator(vx_node node, vx_uint32 index) -{ - return VX_SUCCESS; -} - -vx_status VX_CALLBACK vxTransform_GemmOutputValidator(vx_node node, vx_uint32 index, vx_meta_format metaObj) -{ - return VX_SUCCESS; -} - -vx_status VX_CALLBACK vxValidator(vx_node node, const vx_reference parameters[], - vx_uint32 num, vx_meta_format metas[]) -{ - vx_status status = VX_SUCCESS; - vx_uint32 index = 0; - for(index = 0; index < num; index++) - { - if(index < 2) - { - status |= vxTransform_GemmInputValidator(node,index); - } - else - { - status |= vxTransform_GemmOutputValidator(node,index,metas[index]); - } - } - return status; -} - -static vx_param_description_t vxTransform_GemmKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -vx_status VX_CALLBACK vxTransform_GemmInitializer(vx_node nodObj, const vx_reference *paramObj, vx_uint32 paraNum) -{ -// Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) -#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y)) -#define gcmMAX(x, y) (((x) >= (y)) ? 
(x) : (y)) -#define MAX_MULTIPLIER_NUM (65535) -#define MAX_POST_SHIFT_BITS (31) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_status status = VX_SUCCESS; - vx_tensor input0 = (vx_tensor)paramObj[0]; - vx_tensor input1 = (vx_tensor)paramObj[1]; - vx_tensor output = (vx_tensor)paramObj[2]; - vx_enum src0Format = VSI_NN_TYPE_FLOAT16; - vx_enum src1Format = VSI_NN_TYPE_FLOAT16; - vx_enum dstFormat = VSI_NN_TYPE_FLOAT16; - vx_uint32 coord_size[4] = {1, 1, 1, 1}; - vx_uint32 i = 0; - vsi_nn_tensor_attr_t attr[3]; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr(input0, &attr[0]); - status |= vsi_nn_vxGetTensorAttr(input1, &attr[1]); - status |= vsi_nn_vxGetTensorAttr(output, &attr[2]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - - src0Format = attr[0].dtype.vx_type; - src1Format = attr[1].dtype.vx_type; - for (i = 0; i < attr[1].dim_num; i++) - { - coord_size[i] = attr[1].size[i]; - } - dstFormat = attr[2].dtype.vx_type; - - if (src0Format == VSI_NN_TYPE_FLOAT16 && src1Format == VSI_NN_TYPE_FLOAT16 && dstFormat == VSI_NN_TYPE_FLOAT16) - { - shaderParam.globalWorkScale[0] = 12; - shaderParam.globalWorkScale[1] = 1; - } - - shaderParam.globalWorkSize[0] = - gcmALIGN((coord_size[0] + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = - (coord_size[1] + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1]; - { - vx_uint32 uniGemm3x3_4x4[16] = { - 0x15151515, // TCfg - 0x00000000, // ASelt - 0x02100210, 0x05430543, // ABin - 0x15151515, // BSelt - 0x05430210, 0x05430210, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }; - - vxSetNodeUniform(nodObj, "uniGemm3x3_4x4", 1, uniGemm3x3_4x4); - } - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -static vx_param_description_t vxTransform_setupThresKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -vx_status VX_CALLBACK vxTransform_setupThresInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ -// Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) -#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y)) -#define gcmMAX(x, y) (((x) >= (y)) ? 
(x) : (y)) -#define MAX_MULTIPLIER_NUM (65535) -#define MAX_POST_SHIFT_BITS (31) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_status status = VX_SUCCESS; - vx_scalar thresFlag_s = (vx_scalar)paramObj[2]; - vx_enum src0Format = VSI_NN_TYPE_FLOAT16; - vx_enum src1Format = VSI_NN_TYPE_FLOAT16; - - vx_int32 thresFlag = 0; - vx_uint32 extract_packed[4] = {0}; - - vxCopyScalar(thresFlag_s, &thresFlag, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - if(status < 0) - VSILOGE("error-%s,%d\n",__FILE__,__LINE__); - - shaderParam.globalWorkScale[0] = 1; - shaderParam.globalWorkScale[1] = 1; - shaderParam.localWorkSize[0] = 1; - shaderParam.localWorkSize[1] = 1; - shaderParam.globalWorkSize[0] = 1; - shaderParam.globalWorkSize[1] = 1; - - if (src0Format == src1Format && src0Format == VSI_NN_TYPE_FLOAT16) - { - vx_uint32 i = 0; - vx_uint32 j = 0; - for (i = 0; i < 4; i++) - { - if (thresFlag & (1 << i)) - { - extract_packed[0] |= ((i << 4) << (i * 8)); - } - else - { - extract_packed[0] |= (((j << 4) + 128) << (i * 8)); - j ++; - } - } - - for (i = 4; i < 6; i++) - { - if (thresFlag & (1 << i)) - { - extract_packed[1] |= ((i << 4) << (i * 8 - 32)); - } - else - { - extract_packed[1] |= (((j << 4) + 128) << (i * 8 - 32)); - j ++; - } - } - - extract_packed[2] = extract_packed[3] = 0x10101010; - } - - vxSetNodeUniform(nodObj, "extract_packed", 1, extract_packed); - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - - -static vx_param_description_t vxTransform_InterPKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - - -vx_status VX_CALLBACK vxTransform_InterPInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ -// Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) -#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y)) -#define gcmMAX(x, y) (((x) >= (y)) ? 
(x) : (y)) -#define MAX_MULTIPLIER_NUM (65535) -#define MAX_POST_SHIFT_BITS (31) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_status status = VX_SUCCESS; - vx_tensor input0 = (vx_tensor)paramObj[0]; - vx_tensor input1 = (vx_tensor)paramObj[1]; - vx_tensor output = (vx_tensor)paramObj[2]; - vx_enum src0Format = VSI_NN_TYPE_FLOAT16; - vx_enum src1Format = VSI_NN_TYPE_FLOAT16; - vx_enum dstFormat = VSI_NN_TYPE_FLOAT16; - vx_uint32 coord_size[4] = {1, 1, 1, 1}; - vx_uint32 input_size[4] = {1, 1, 1, 1}; - vx_uint32 output_size[4] = {1, 1, 1, 1}; - vx_uint32 i = 0; - vsi_nn_tensor_attr_t attr[3]; - - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr(input0, &attr[0]); - status |= vsi_nn_vxGetTensorAttr(input1, &attr[1]); - status |= vsi_nn_vxGetTensorAttr(output, &attr[2]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - - for (i = 0; i < attr[0].dim_num; i++) - { - input_size[i] = attr[0].size[i]; - } - src0Format = attr[0].dtype.vx_type; - src1Format = attr[1].dtype.vx_type; - for (i = 0; i < attr[1].dim_num; i++) - { - coord_size[i] = attr[1].size[i]; - } - dstFormat = attr[2].dtype.vx_type; - for (i = 0; i < attr[2].dim_num; i++) - { - output_size[i] = attr[2].size[i]; - } - - if ((src0Format == VSI_NN_TYPE_FLOAT16 && src1Format == VSI_NN_TYPE_FLOAT16 && dstFormat == VSI_NN_TYPE_FLOAT16) - || (src0Format == VSI_NN_TYPE_INT16 && src1Format == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_INT16)) - { - shaderParam.globalWorkScale[0] = 2; - shaderParam.globalWorkScale[1] = 1; - } - - shaderParam.globalWorkSize[0] = - gcmALIGN((coord_size[0] + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = - (coord_size[1] + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1]; - { - vx_int32 packedWH2[2] = {input_size[0], input_size[1]}; - vx_int32 packedWH = (input_size[1] << 16) | (input_size[0] & 0xFFFF); - vx_uint32 uniGetDXY_4x4[16] = { - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00100001, 0x00010010, // ABin - 0x09090909, // BSelt - 0x00010000, 0x00000001, // BBin - 0x00000101, // AccumType, ConstantType, and PostShift - 0x3c000000, 0x00000000, 0x3c000000, 0x00000000, - 0x3c000000, 0x00000000, 0x3c000000, 0x00000000 // Constant - }; - vx_uint32 uniConvertF16toF32_4x4[16] = { - 0x01010101, // TCfg - 0x01010000, // ASelt - 0x00010000, 0x00010000, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }; - - vxSetNodeUniform(nodObj, "uniGetDXY_4x4", 1, uniGetDXY_4x4); - vxSetNodeUniform(nodObj, "uniConvertF16toF32_4x4", 1, uniConvertF16toF32_4x4); - - //packedWH2[0] = input_size[0]; - //packedWH2[1] = input_size[1]; - //packedWH = (input_size[1] << 16) | (input_size[0] & 0xFFFF); - vxSetNodeUniform(nodObj, "packedWH2", 1, packedWH2); - vxSetNodeUniform(nodObj, "packedWH", 1, &packedWH); - } - if (output_size[2] > 1) - { - 
vxSetNodeUniform(nodObj, "depth", 1, &output_size[2]); - } - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - return VX_SUCCESS; -} - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxSpatial_transformer_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTransform_GemmKernelInfo_F16toF16 = -{ - VX_KERNEL_ENUM_SPATIAL_TRANSFORMER, - VX_KERNEL_NAME_SPATIAL_TRANSFORMER, - NULL, - vxTransform_GemmKernelParam, - (sizeof(vxTransform_GemmKernelParam) / sizeof(vxTransform_GemmKernelParam[0])), - vxValidator, - NULL, - NULL, - vxTransform_GemmInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTransform_setupThresKernelInfo_F16toF16 = -{ - VX_KERNEL_ENUM_SPATIAL_TRANSFORMER, - VX_KERNEL_NAME_TRANSFORM_SETUP_THRES_F16TOF16, - NULL, - vxTransform_setupThresKernelParam, - (sizeof(vxTransform_setupThresKernelParam) / sizeof(vxTransform_setupThresKernelParam[0])), - vxValidator, - NULL, - NULL, - vxTransform_setupThresInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTransform_InterPKernelInfo_F16toF16_2D = -{ - VX_KERNEL_ENUM_SPATIAL_TRANSFORMER, - VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16_2D, - NULL, - vxTransform_InterPKernelParam, - (sizeof(vxTransform_InterPKernelParam) / sizeof(vxTransform_InterPKernelParam[0])), - vxValidator, - NULL, - NULL, - vxTransform_InterPInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTransform_InterPKernelInfo_F16toF16 = -{ - VX_KERNEL_ENUM_SPATIAL_TRANSFORMER, - VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16, - NULL, - vxTransform_InterPKernelParam, - (sizeof(vxTransform_InterPKernelParam) / sizeof(vxTransform_InterPKernelParam[0])), - vxValidator, - NULL, - NULL, - vxTransform_InterPInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_SPATIAL_TRANSFORMER_list[] = -{ - &vxSpatial_transformer_CPU, - &vxTransform_setupThresKernelInfo_F16toF16, - &vxTransform_GemmKernelInfo_F16toF16, - &vxTransform_InterPKernelInfo_F16toF16_2D, - &vxTransform_InterPKernelInfo_F16toF16, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c deleted file mode 100644 index 0b4805d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c +++ /dev/null @@ -1,124 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ -#include -#include -#include -#include -#include - -#include "vsi_nn_pub.h" -#include "utils/vsi_nn_util.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR_CPU (vx_client_kernel_cpu_SYNC_HOST) -#define _VX_KERNEL_ID KERNEL_ENUM_SYNC_HOST -#define _VX_KERNEL_NAME ("com.vivantecorp.extension.Sync_hostVXC") -#define _VX_KERNEL_FUNC_KERNEL (vxSync_hostKernel) - -static vsi_status VX_CALLBACK vxSync_hostKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = 0; - vx_context context = NULL; - vx_tensor input = NULL; - vx_tensor output = NULL; - uint8_t * in_buffer = NULL; - uint32_t in_stride[8] = { 0 }; - vx_tensor_addressing in_addr = NULL; - vsi_nn_tensor_attr_t in_attr; - - status = VX_SUCCESS; - context = vxGetContext( (vx_reference)node ); - input = (vx_tensor)paramObj[0]; - output = (vx_tensor)paramObj[1]; - memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); - - in_buffer = vsi_nn_ConvertRawTensorToData2( context, input, - &in_attr, in_stride, &in_addr, VX_READ_ONLY ); - - status = vsi_nn_vxCopyDataToTensor(context, output, &in_attr, in_buffer); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxCopyDataToTensor failure! 
at line %d\n", __LINE__); - goto OnError; - } - -OnError: - if( NULL != in_buffer ) - { - free( in_buffer ); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t s_params[] = - { - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - }; - -vx_status VX_CALLBACK vxSync_hostInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_SYNC_HOST_list[] = -{ - &_VX_KERNEL_VAR_CPU, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c deleted file mode 100644 index 9d2c936..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c +++ /dev/null @@ -1,287 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
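/* The deleted CPU reference below derives byte strides from the tensor shape
 * before building a vx_tensor_addressing: stride[0] is the element size and
 * stride[i] = stride[i-1] * size[i-1]. A standalone sketch of that row-major
 * stride rule with assumed example dimensions:
 */
#include <stdio.h>

int main(void)
{
    unsigned size[4]   = {128, 64, 3, 1};   /* assumed dims, e.g. W, H, C, N  */
    unsigned stride[4] = {2};               /* int16/fp16 element = 2 bytes   */
    for (int i = 1; i < 4; ++i)
        stride[i] = stride[i - 1] * size[i - 1];
    printf("byte strides: %u %u %u %u\n",
           stride[0], stride[1], stride[2], stride[3]);
    return 0;
}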
-* -*****************************************************************************/ - -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -void tensorStackConcatFunc - ( - int16_t* dataIn, - int16_t* dataIO, - int32_t index, - uint32_t width, - uint32_t height, - uint32_t channel, - uint32_t batch - ) -{ - int32_t stride = width * sizeof(int16_t); - VSILOGI("Hello tensorStackConcatFunc!\n"); - memcpy(dataIO + index * width, dataIn, stride); - return; -} -vsi_status VX_CALLBACK vxTensorStackConcatKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_ERROR_INVALID_PARAMETERS; - - if(paramNum == 3) - { - vx_context context = NULL; - // tensor - vx_tensor imgObj[2] = { NULL }; - vsi_nn_tensor_attr_t attr[2]; - int16_t *input = NULL, *output = NULL; - uint32_t input_size[4] = {1, 1, 1, 1}, output_size[4] = {1, 1, 1, 1}; - uint32_t input_stride_size[4] = {1, 1, 1, 1}; - uint32_t output_stride_size[4] = {1, 1, 1, 1}; - vx_tensor_addressing input_user_addr = NULL; - vx_tensor_addressing output_user_addr = NULL; - vsi_nn_type_e inputFormat = VSI_NN_TYPE_FLOAT16, outputFormat = VSI_NN_TYPE_FLOAT16; - uint32_t input_dims = 0, output_dims = 0; - uint32_t i; - // scalar - vx_scalar scalar[1] = { NULL }; - int32_t index = 0; - - status = VX_SUCCESS; - imgObj[0] = (vx_tensor)paramObj[0]; - imgObj[1] = (vx_tensor)paramObj[1]; - scalar[0] = (vx_scalar)paramObj[2]; - memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); - memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); - context = vxGetContext((vx_reference)node); - if (context == NULL) - { - VSILOGE("vxGetContext failure! at line %d\n", __LINE__); - return status; - } - - status = vsi_nn_vxGetTensorAttr(imgObj[0], &attr[0]); - status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]); - status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - goto final; - } - - //input - input_dims = attr[0].dim_num; - inputFormat = attr[0].dtype.vx_type; - for (i = 0; i < input_dims; i++) - { - input_size[i] = attr[0].size[i]; - } - //output - output_dims = attr[1].dim_num; - outputFormat = attr[1].dtype.vx_type; - for (i = 0; i < output_dims; i++) - { - output_size[i] = attr[1].size[i]; - } - - input_size[2] = (input_dims <= 2)?1:input_size[2]; - input_size[3] = (input_dims <= 3)?1:input_size[3]; - input_stride_size[0] = vsi_nn_GetTypeBytes(inputFormat); - for (i=1; i< input_dims; i++) - { - input_stride_size[i] = input_stride_size[i-1] * input_size[i-1]; - } - input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t)); - input_user_addr = vxCreateTensorAddressing(context, input_size, input_stride_size, (vx_uint8)input_dims); - vsi_nn_copy_tensor_patch(imgObj[0], &attr[0], input, VX_READ_ONLY); - output_stride_size[0] = vsi_nn_GetTypeBytes(outputFormat); - for (i=1; i< output_dims; i++) - { - output_stride_size[i] = output_stride_size[i-1] * output_size[i-1]; - } - output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t)); - output_user_addr = vxCreateTensorAddressing(context, output_size, - output_stride_size, (vx_uint8)output_dims); - - vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], output, VX_READ_ONLY); - // scalar - status = vxCopyScalar(scalar[0], &index, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - if (status != VX_SUCCESS) - { - VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__); - goto final; - } - // Call C Prototype - tensorStackConcatFunc(input, output, index, input_size[0], - input_size[1], input_size[2], input_size[3]); - //output tensor - vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], output, VX_WRITE_ONLY); -final: - if(input) free(input); - if(output) free(output); - if(input_user_addr) vxReleaseTensorAddressing(&input_user_addr); - if(output_user_addr) vxReleaseTensorAddressing(&output_user_addr); - } - return status; -} -vsi_status VX_CALLBACK vxTensorStackConcatInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - uint32_t paraNum - ) -{ - vsi_status status = VX_SUCCESS; - // Alignment with a power of two value. -#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 3, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - - vx_tensor input = (vx_tensor)paramObj[0]; - uint32_t input_size[4] = {1, 1, 1, 1}; - uint32_t input_dims = 0; - vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16; - vsi_nn_tensor_attr_t attr; - uint32_t i; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - status = vsi_nn_vxGetTensorAttr(input, &attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); - return status; - } - - input_dims = attr.dim_num; - inputDataFormat = attr.dtype.vx_type; - for (i = 0; i < input_dims; i++) - { - input_size[i] = attr.size[i]; - } - input_size[2] = (input_dims <= 2)?1:input_size[2]; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkOffset[2] = 0; - if (inputDataFormat == VSI_NN_TYPE_FLOAT16 || inputDataFormat == VSI_NN_TYPE_INT16) - shaderParam.globalWorkScale[0] = 16; - else - shaderParam.globalWorkScale[0] = 32; - shaderParam.globalWorkScale[1] = 1; - shaderParam.globalWorkScale[2] = 1; - shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) - / shaderParam.globalWorkScale[0], 4); - shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) - / shaderParam.globalWorkScale[1]; - shaderParam.globalWorkSize[2] = (input_size[2] + shaderParam.globalWorkScale[2] - 1) - / shaderParam.globalWorkScale[2]; - - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - if(status < 0) - { - VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); - } - return status; -} -static vx_param_description_t vxTensorStackConcatKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxTensorStackConcatKernelInfo = -{ - VX_KERNEL_ENUM_TENSORSTACKCONCAT, - VX_KERNEL_NAME_TENSORSTACKCONCAT, - NULL, - vxTensorStackConcatKernelParam, - (sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxTensorStackConcatInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorStackConcatKernelInfo8Bits = -{ - VX_KERNEL_ENUM_TENSORSTACKCONCAT8BITS, - VX_KERNEL_NAME_TENSORSTACKCONCAT8BITS, - NULL, - vxTensorStackConcatKernelParam, - (sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vxTensorStackConcatInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTensorStackConcatKernelInfo_CPU = -{ - VX_KERNEL_ENUM_TENSORSTACKCONCAT, - VX_KERNEL_NAME_TENSORSTACKCONCAT, - vxTensorStackConcatKernel, - vxTensorStackConcatKernelParam, - (sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_TENSORSTACKCONCAT_list[] = -{ - &vxTensorStackConcatKernelInfo_CPU, - &vxTensorStackConcatKernelInfo, - &vxTensorStackConcatKernelInfo8Bits, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx index e419457..d4d159c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx @@ -30,24 +30,27 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1( \ _viv_asm(COPY, mean, _mean, 16); \ VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, var, _var, 16); \ - float4 gamma0 = read_imagef(Gamma, coord); \ - coord.x += 4; \ - 
float4 gamma1 = read_imagef(Gamma, coord); \ - coord.x -= 4; \ - float4 beta = read_imagef(Beta, coord); \ + int4 coord_in = coord; \ + int depth = get_image_array_size(Gamma); \ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \ + float4 gamma = read_imagef(Gamma, coord_in); \ + coord_in.z = coord.z; \ + depth = get_image_array_size(Beta); \ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \ + float4 beta = read_imagef(Beta, coord_in); \ \ float4 src0, src1, m, v; \ VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ - gamma0 = gamma0 * rsqrt(v + eps); \ + float4 gamma0 = gamma.xxxx * rsqrt(v + eps); \ src0 = src0 * input_scale + input_tail; \ src0 = (src0 - m) * gamma0 + beta.xxxx; \ src0 = src0 * output_scale + output_zp; \ VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ - gamma1 = gamma1 * rsqrt(v + eps); \ + float4 gamma1 = gamma.xxxx * rsqrt(v + eps); \ src1 = src1 * input_scale + input_tail; \ src1 = (src1 - m) * gamma1 + beta.xxxx; \ src1 = src1 * output_scale + output_zp; \ @@ -95,22 +98,21 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1_2D( \ _viv_asm(COPY, mean, _mean, 16); \ VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, var, _var, 16); \ - float4 gamma0 = read_imagef(Gamma, coord.xy); \ - float4 gamma1 = read_imagef(Gamma, coord.zy); \ + float4 gamma = read_imagef(Gamma, coord.xy); \ float4 beta = read_imagef(Beta, coord.xy); \ \ float4 src0, src1, m, v; \ VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ - gamma0 = gamma0 * rsqrt(v + eps); \ + float4 gamma0 = gamma.xxxx * rsqrt(v + eps); \ src0 = src0 * input_scale + input_tail; \ src0 = (src0 - m) * gamma0 + beta.xxxx; \ src0 = src0 * output_scale + output_zp; \ VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ - gamma1 = gamma1 * rsqrt(v + eps); \ + float4 gamma1 = gamma.xxxx * rsqrt(v + eps); \ src1 = src1 * input_scale + input_tail; \ src1 = (src1 - m) * gamma1 + beta.xxxx; \ src1 = src1 * output_scale + output_zp; \ @@ -158,12 +160,18 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0( \ _viv_asm(COPY, mean, _mean, 16); \ VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, var, _var, 16); \ - float4 gamma0 = read_imagef(Gamma, coord); \ - float4 beta0 = read_imagef(Beta, coord); \ - coord.x += 4; \ - float4 gamma1 = read_imagef(Gamma, coord); \ - float4 beta1 = read_imagef(Beta, coord); \ - coord.x -= 4; \ + int4 coord_in0 = coord; \ + int depth = get_image_array_size(Gamma); \ + _viv_asm(CLAMP0MAX, coord_in0.z, coord_in0.z, depth - 1); \ + float4 
gamma0 = read_imagef(Gamma, coord_in0); \ + int4 coord_in1 = coord; \ + depth = get_image_array_size(Beta); \ + _viv_asm(CLAMP0MAX, coord_in1.z, coord_in1.z, depth - 1); \ + float4 beta0 = read_imagef(Beta, coord_in1); \ + coord_in0.x += 4; \ + coord_in1.x += 4; \ + float4 gamma1 = read_imagef(Gamma, coord_in0); \ + float4 beta1 = read_imagef(Beta, coord_in1); \ \ float4 src0, src1, m, v; \ VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ @@ -264,4 +272,3 @@ BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) - diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx index f6ac4ce..dc2497e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx @@ -83,14 +83,6 @@ __kernel void conv1d_U8U8I32toU8_K1024_SMALL( VXC_WriteImage(output, coord.wy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } -inline uchar* get_image2D_array_ptr(image2d_array_t input) -{ - int8 desc; - _viv_asm(COPY, desc, input, sizeof(desc)); - uchar *src_ptr = (uchar*)desc.s0; - return src_ptr; -} - __kernel void conv1d_U8U8I32toU8_K1024_LARGE( __read_only image2d_array_t input, __read_only image2d_array_t weight, @@ -112,9 +104,11 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE( vxc_short8 w_zp = (short)weight_ZP; vxc_uchar16 input_val = 0, weight_val = 0; int temp = 0, i, j; - uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input); + Tensor src_tensor = create_image_from_image2d(input, 1); + uchar *src_ptr_base = (uchar *)src_image.ptr; uchar *src_ptr; - uchar *dst_ptr = (uchar *)get_image2D_array_ptr(output); + Tensor dst_tensor = create_image_from_image2d(output, 1); + uchar *dst_ptr = (uchar *)dst_tensor.ptr; temp = read_imagei(bias, coord.yz).x; sum0 = convert_float(temp); @@ -122,7 +116,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE( for (i = 0; i < input_height; i++) { - src_ptr = src_ptr_base + (coord.x + coord.z * input_width); + src_ptr = src_ptr_base + (coord.x + coord.z * src_image.stride_y); for (j = 0; j < kernel_cnt_x16; j++) { VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \ @@ -161,7 +155,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE( _viv_asm(CONV_SAT_RTE, result1, sum1); vxc_uchar8 result; VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8); - dst_ptr = dst_ptr + (coord.w + coord.y * output_width); + dst_ptr = dst_ptr + (coord.w + coord.y * dst_tensor.stride_y); VXC_Vstore8(dst_ptr, 0, result); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx index a8c4583..dea29d2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx @@ -72,6 +72,56 @@ float4 eltwise_unary_round(float4 x) return convert_float4(convert_int4_rte(x)); } +#define MUL2_RSQRTPI (1.1283791670955126f) +float erf_eval(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; + float x_pow = x; + float one = 1.0f; + float n = 1; + + if (x <= -3) + return -1; + 
else if(x >= 3) + return 1; + + while (fabs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n += 1.0f; + } + return res * MUL2_RSQRTPI; +} +#define RSQRT2 (0.70710678118654752440084436210485f) +float4 eltwise_unary_gelu(float4 x) +{ + float4 erf, data; + data = x * RSQRT2; + erf.x = erf_eval(data.x); + erf.y = erf_eval(data.y); + erf.z = erf_eval(data.z); + erf.w = erf_eval(data.w); + x = 0.5f * x * (1 + erf); + + return x; +} + +#define SQRT_2_RCP_PI 0.7978845834732056f +float4 eltwise_unary_hard_gelu(float4 x) +{ + float4 cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI * + (x + 0.044715f * x * x * x)); + return x * cdf; +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -203,6 +253,28 @@ ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//GELU +ELTSISE_UNARY_2D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//HARD_GELU +ELTSISE_UNARY_2D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -252,3 +324,7 @@ ELTSISE_UNARY_BF16_2D(mish) ELTSISE_UNARY_BF16_2D(hard_sigmoid) //ROUND ELTSISE_UNARY_BF16_2D(round) +//GELU +ELTSISE_UNARY_BF16_2D(gelu) +//HARD_GELU +ELTSISE_UNARY_BF16_2D(hard_gelu) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx index 393e4a0..6da7605 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx @@ -72,6 +72,56 @@ float4 eltwise_unary_round(float4 x) return convert_float4(convert_int4_rte(x)); } +#define MUL2_RSQRTPI (1.1283791670955126f) +float erf_eval(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; + float x_pow = x; + float one = 1.0f; + float n = 1; + + if (x <= -3) + return -1; + else if(x >= 3) + return 1; + + while (fabs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n += 1.0f; + } + return res * MUL2_RSQRTPI; +} +#define RSQRT2 (0.70710678118654752440084436210485f) +float4 eltwise_unary_gelu(float4 x) +{ + float4 erf, data; + data = x * RSQRT2; + erf.x = erf_eval(data.x); + erf.y = erf_eval(data.y); + erf.z = erf_eval(data.z); + erf.w = erf_eval(data.w); + x = 0.5f * x * (1 + erf); + + return x; +} + +#define SQRT_2_RCP_PI 0.7978845834732056f +float4 eltwise_unary_hard_gelu(float4 x) +{ + float4 cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI * + (x + 0.044715f * x * x * x)); + return x * cdf; +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -203,6 +253,28 @@ ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc ELTSISE_UNARY_3D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//GELU +ELTSISE_UNARY_3D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//HARD_GELU +ELTSISE_UNARY_3D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -250,4 +322,8 @@ ELTSISE_UNARY_BF16(mish) //HARD_SIGMOID ELTSISE_UNARY_BF16(hard_sigmoid) //ROUND 
-ELTSISE_UNARY_BF16(round) \ No newline at end of file +ELTSISE_UNARY_BF16(round) +//GELU +ELTSISE_UNARY_BF16(gelu) +//HARD_GELU +ELTSISE_UNARY_BF16(hard_gelu) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx b/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx index 9247044..37bde57 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx @@ -1,8 +1,9 @@ #include "cl_viv_vx_ext.h" #define MUL2_RSQRTPI (1.1283791670955126f) -float eltwise_unary_erf(float x) +float eltwise_unary_erf(float _x) { + float x = clamp(_x, -2, 2); float res = 0; float tmp = x; float factorial = 1; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/get_matrix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/get_matrix.vx new file mode 100644 index 0000000..8b67fb6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/get_matrix.vx @@ -0,0 +1,185 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float4 theta_1; +_viv_uniform float4 theta_2; +_viv_uniform float4 scale; +_viv_uniform float input_scale; +_viv_uniform float input_tail; + +#define GET_MATRIX_SH_IMPL(name0, in_type, read_func) \ +__kernel void get_matrix_##name0##toF32 \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int has_theta_1_1, \ + int has_theta_1_2, \ + int has_theta_1_3, \ + int has_theta_2_1, \ + int has_theta_2_2, \ + int has_theta_2_3, \ + float theta_1_1, \ + float theta_1_2, \ + float theta_1_3, \ + float theta_2_1, \ + float theta_2_2, \ + float theta_2_3, \ + float i_width, \ + float i_height, \ + float o_width, \ + float o_height \ + ) \ +{ \ + int2 coord = (int2)(0, get_global_id(1)); \ + float4 matrix0, matrix1; \ + float4 theta1, theta2; \ + _viv_asm(COPY, theta1, theta_1, 16); \ + _viv_asm(COPY, theta2, theta_2, 16); \ + \ + if (has_theta_1_1 == 0) \ + { \ + in_type data = read_func(input, coord); \ + coord.x ++; \ + theta1.x = convert_float(data.x) * input_scale + input_tail; \ + } \ + \ + if (has_theta_1_2 == 0) \ + { \ + in_type data = read_func(input, coord); \ + coord.x ++; \ + theta1.y = convert_float(data.x) * input_scale + input_tail; \ + } \ + \ + if (has_theta_1_3 == 0) \ + { \ + in_type data = read_func(input, coord); \ + coord.x ++; \ + theta1.z = convert_float(data.x) * input_scale + input_tail; \ + } \ + \ + if (has_theta_2_1 == 0) \ + { \ + in_type data = read_func(input, coord); \ + coord.x ++; \ + theta2.x = convert_float(data.x) * input_scale + input_tail; \ + } \ + \ + if (has_theta_2_2 == 0) \ + { \ + in_type data = read_func(input, coord); \ + coord.x ++; \ + theta2.y = convert_float(data.x) * input_scale + input_tail; \ + } \ + \ + if (has_theta_2_3 == 0) \ + { \ + in_type data = read_func(input, coord); \ + coord.x ++; \ + theta2.z = convert_float(data.x) * input_scale + input_tail; \ + } \ + \ + matrix0.x = theta2.y * scale.x; \ + matrix0.z = theta2.x * scale.z; \ + matrix1.x = ( theta2.z - theta2.y - theta2.x + 1) * i_width * 0.5f; \ + matrix0.y = theta1.y * scale.w; \ + matrix0.w = theta1.x * scale.y; \ + matrix1.y = ( theta1.z - theta1.y - theta1.x + 1) * i_height * 0.5f; \ + matrix1.zw = 2.0f * matrix0.xy; \ + \ + coord.x = 0; \ + vxc_ushort8 dst; \ + _viv_asm(COPY, dst, matrix0, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, dst, matrix1, 16); \ + coord.x = 8; \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GET_MATRIX_SH_IMPL(I16, int4, read_imagei) +GET_MATRIX_SH_IMPL(I8, int4, 
read_imagei) +GET_MATRIX_SH_IMPL(U8, uint4, read_imageui) + +__kernel void get_matrix_F16toF32 + ( + __read_only image2d_t input, + __write_only image2d_t output, + int has_theta_1_1, + int has_theta_1_2, + int has_theta_1_3, + int has_theta_2_1, + int has_theta_2_2, + int has_theta_2_3, + float theta_1_1, + float theta_1_2, + float theta_1_3, + float theta_2_1, + float theta_2_2, + float theta_2_3, + float i_width, + float i_height, + float o_width, + float o_height + ) +{ + int2 coord = (int2)(0, get_global_id(1)); + float4 matrix0, matrix1; + float4 theta1, theta2; + _viv_asm(COPY, theta1, theta_1, 16); + _viv_asm(COPY, theta2, theta_2, 16); + + if (has_theta_1_1 == 0) + { + float4 data = read_imagef(input, coord); + coord.x ++; + theta1.x = data.x; + } + + if (has_theta_1_2 == 0) + { + float4 data = read_imagef(input, coord); + coord.x ++; + theta1.y = data.x; + } + + if (has_theta_1_3 == 0) + { + float4 data = read_imagef(input, coord); + coord.x ++; + theta1.z = data.x; + } + + if (has_theta_2_1 == 0) + { + float4 data = read_imagef(input, coord); + coord.x ++; + theta2.x = data.x; + } + + if (has_theta_2_2 == 0) + { + float4 data = read_imagef(input, coord); + coord.x ++; + theta2.y = data.x; + } + + if (has_theta_2_3 == 0) + { + float4 data = read_imagef(input, coord); + coord.x ++; + theta2.z = data.x; + } + + matrix0.x = theta2.y * scale.x; + matrix0.z = theta2.x * scale.z; + matrix1.x = ( theta2.z - theta2.y - theta2.x + 1) * i_width * 0.5f; + matrix0.y = theta1.y * scale.w; + matrix0.w = theta1.x * scale.y; + matrix1.y = ( theta1.z - theta1.y - theta1.x + 1) * i_height * 0.5f; + matrix1.zw = 2.0f * matrix0.xy; + + coord.x = 0; + vxc_ushort8 dst; + _viv_asm(COPY, dst, matrix0, 16); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, dst, matrix1, 16); + coord.x = 8; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx index ed18f67..2fd2d44 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx @@ -16,7 +16,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); + int4 coord = (int4)(gidx, 0, gidz, gidz); vxc_short8 src0; vxc_half8 in_h; vxc_float4 sumsqr; @@ -133,7 +133,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int4 coord_para = (int4)(gidz, 0, 0, 0); vxc_short8 src0; vxc_short8 src1; @@ -166,18 +167,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, 
coord.z, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, in_h, src0, 16); + coord_in.y ++; + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -191,7 +194,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx index 523bb38..fa5538c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx @@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); + int4 coord = (int4)(gidx, 0, gidz, gidz); vxc_short8 src0; float sum = 0, sqr = 0; vxc_float4 sumsqr = (vxc_float4)(0); @@ -43,7 +43,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean { for(coord.y = 0; coord.y < height;) { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ @@ -106,7 +106,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean { for(; coord.y < endH;) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ @@ -154,7 +154,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int4 coord_para = (int4)(gidz, 0, 0, 0); vxc_short8 src0; vxc_short8 src1; @@ -162,7 +163,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -190,16 +191,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, 
output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, coord.z, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertInt16Fp32Fst_4x4); VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -213,7 +215,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } @@ -238,7 +240,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -265,7 +267,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t for(; coord.y < endH; coord.y++) { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src0, input, coord.xy, 0,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -294,7 +296,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int4 coord_para = (int4)(gidz, 0, 0, 0); vxc_short8 src0, src2; vxc_short8 src1; @@ -302,7 +305,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); @@ -326,15 +329,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, coord.z, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; VXC_DP4x4(tmpData0, src0, src0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertInt16Fp32Fst_4x4); VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -346,7 +350,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t tmpVal1 = convert_int4_rte(norm); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toInt16_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } @@ -371,7 +375,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); @@ -394,7 +398,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t for(; coord.y < endH; coord.y++) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src0, input, coord, 0,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertInt16Fp32Fst_4x4); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx index dc19b5e..a6c98ef 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx @@ -27,7 +27,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); + int4 coord = (int4)(gidx, 0, gidz, gidz); vxc_char16 src0; float sum = 0, sqr = 0; int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; @@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); int4 coord_para = (int4)(gidz, 0, 0, 0); vxc_char16 src0; vxc_short8 src1, outval; @@ -277,7 +277,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int4 coord_para = (int4)(gidz, 0, 0, 0); vxc_char16 src0, src2; vxc_short8 src1; @@ -309,16 +310,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, coord.z, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, 
input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); @@ -333,7 +335,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx index 845945c..b81a1a1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx @@ -25,7 +25,8 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \ image2d_array_t output, float eps, int rsFlg) \ { \ int gidz = get_global_id(1); \ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ int2 coord_para = (int2)(gidz, 0); \ read_type src0, src2; \ float scale_vari, bias_val; \ @@ -60,15 +61,16 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \ int8 input_desc, output_desc; \ _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ - _viv_asm(MOV, coord.z, baseAddr_a); \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ - _viv_asm(MOV, coord.w, baseAddr); \ + _viv_asm(MOV, coord.z, baseAddr); \ \ for(coord.y = 0; coord.y < height; coord.y++) \ { \ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvert1stUint8SubZpToFp32_4x4); \ VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ @@ -87,7 +89,7 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \ norm = tmpData3 * alpha + bias_val; \ tmpVal1 = convert_int4_rte(norm); \ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ } \ } INSTANCENORM_8BITS_F32(U8, vxc_uchar16) @@ -166,7 +168,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 
coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int2 coord_para = (int2)(gidz, 0); vxc_short8 src0, src2; float scale_vari, bias_val; @@ -201,15 +204,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, coord.z, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertInt16Fp32Fst_4x4); VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -221,7 +225,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F tmpVal1 = convert_int4_rte(norm); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toInt16_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx index 771b319..bba8627 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx @@ -17,7 +17,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); + int4 coord = (int4)(gidx, 0, gidz, gidz); vxc_short8 src0, src1, src2; float4 srcA, srcB; vxc_float sum = 0, sqr = 0; @@ -134,7 +134,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); vxc_short8 src0, src1, src2; float scale_vari, bias_val; vxc_float4 mean_vari = (vxc_float4)(0); @@ -144,7 +145,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 Image img3 = create_image_from_image2d(meanVari, 4); __global float* bias_ptr = (__global float*)img1.ptr; __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz); + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz)); __global float4* vari_ptr = (__global float4*)sumVari_ptr; float bval = bias_ptr[gidz]; @@ -166,16 +167,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + 
input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, coord.z, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), @@ -189,7 +191,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 norm = scale_vari * tmpData1 + bias_val; _viv_asm(COPY, src1, norm, 16); VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx index 81e5ec5..d51e38e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx @@ -13,7 +13,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); vxc_short8 src0; vxc_half8 in_h; float scale_vari, bias_val; @@ -24,7 +25,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F Image img3 = create_image_from_image2d(meanVari, 4); __global float* bias_ptr = (__global float*)img1.ptr; __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz); + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz)); __global float4* vari_ptr = (__global float4*)sumVari_ptr; float bval = bias_ptr[gidz]; @@ -49,18 +50,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, coord.z, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, in_h, src0, 16); + coord_in.y ++; + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -74,7 +77,7 @@ __kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx index 4becc2b..5c0f235 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx @@ -29,7 +29,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); + int4 coord = (int4)(gidx, 0, gidz, gidz); vxc_uchar16 src0; float sum = 0, sqr = 0; int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0; @@ -44,7 +44,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean { for(coord.y = 0; coord.y < height;) { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, 0, \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); @@ -96,7 +96,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean { for(; coord.y < endH;) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); @@ -133,7 +133,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int4 coord_para = (int4)(gidz, 0, 0, 0); vxc_uchar16 src0, src2; vxc_short8 src1; @@ -141,7 +142,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); @@ -166,15 +167,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, coord.z, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + 
coord_in.y ++; VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); @@ -189,7 +191,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } } @@ -208,7 +210,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); @@ -232,7 +234,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to for(; coord.y < endH; coord.y++) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx index 9602d13..b737ffe 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx @@ -19,7 +19,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); int4 coord_para = (int4)(gidz, 0, 0, 0); vxc_uchar16 src0; vxc_short8 src1, outval; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx index d35d79e..a90f1ff 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx @@ -6,14 +6,6 @@ do \ VXC_OP3_NoDest(vstore3, Pointer, byteOffset, Data); } \ while(0) -inline uchar* get_image2D_array_ptr(image2d_array_t input) -{ - int8 desc; - _viv_asm(COPY, desc, input, sizeof(desc)); - uchar *src_ptr = (uchar*)desc.s0; - return src_ptr; -} - #define L2NORMSCALE_SWITCH_PROCESS(case_value, vec_val, ZpValue) \ switch (case_value) \ { \ @@ -104,8 +96,10 @@ _viv_uniform int inputZP; #define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ vxc_float4 rsqrt0;\ - dst_type 
*dst_ptr = (dst_type *)get_image2D_array_ptr(output); \ - short *scale_ptr = (short *)get_image2D_array_ptr(scale); \ + Image dst_img = create_image_from_image2d(output, 1); \ + dst_type *dst_ptr = (dst_type *)dst_img.ptr; \ + Image s_img = create_image_from_image2d(scale, 2); \ + short *scale_ptr = (short *)s_img.ptr; \ vxc_float4 vec0, vec1;\ convert_type dst0, dst1;\ vxc_short8 scale_s16;\ @@ -188,15 +182,16 @@ _viv_uniform int inputZP; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \ (\ - __read_only image2d_array_t input,\ - __read_only image2d_array_t scale,\ - __write_only image2d_array_t output,\ + __read_only image2d_t input,\ + __read_only image2d_t scale,\ + __write_only image2d_t output,\ int axis\ )\ { \ int lidx = get_local_id(0); \ int offset = get_global_id(0); \ - read_type *src_ptr_base = (read_type *)get_image2D_array_ptr(input); \ + Image src_img = create_image_from_image2d(input, 1); \ + read_type *src_ptr_base = (read_type *)src_img.ptr; \ read_type *src_ptr; \ read_type2 src0, src1; \ src_type val0, val1; \ @@ -267,7 +262,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ { \ int lidx = get_local_id(0); \ int offset = get_global_id(0); \ - uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input); \ + Image src_img = create_image_from_image2d(input, 1); + uchar *src_ptr_base = (uchar *)src_img.ptr; \ uchar *src_ptr; \ vxc_uchar8 src0, src1; \ vxc_uchar8 val0, val1; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx index 7a796a2..c0a6e19 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx @@ -11,7 +11,7 @@ __kernel void layer_norm_F16toF16( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_array_t output, float eps) { - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_out = coord; int8 input_desc, output_desc; @@ -21,18 +21,18 @@ __kernel void layer_norm_F16toF16( vxc_short8 src0, src1; vxc_float sum = 0, sqr = 0; - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); + _viv_asm(MOV, coord_out.z, baseAddr); for(coord.x = 8; coord.x < (width+8); coord.x += 8) { vxc_half8 val0_h; _viv_asm(COPY, val0_h, src0, 16); - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_float4 sumsqr; VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ @@ -49,7 +49,7 @@ __kernel void layer_norm_F16toF16( vxc_float4 bias_f; for(coord.x = 0; coord.x < width; coord.x += 4) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); @@ -73,7 +73,7 @@ __kernel void layer_norm_F16toF16( vxc_short8 
dstval; _viv_asm(COPY, dstval, dst, 16); coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } } @@ -99,7 +99,7 @@ __kernel void layer_norm_U8toU8( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_array_t output, float eps) { - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_out = coord; vxc_uchar16 src0, src2; @@ -119,11 +119,11 @@ __kernel void layer_norm_U8toU8( _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); + _viv_asm(MOV, coord_out.z, baseAddr); for(coord.x = 0; coord.x < width; coord.x += 16) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); tmpSum += (tmpSum1.x); @@ -144,7 +144,7 @@ __kernel void layer_norm_U8toU8( for(coord.x = 0; coord.x < width; coord.x += 16) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -203,7 +203,7 @@ __kernel void layer_norm_U8toU8( VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } } @@ -212,7 +212,7 @@ __kernel void layer_norm_F16toU8( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_array_t output, float eps) { - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_out = coord; int8 input_desc, output_desc; @@ -222,18 +222,18 @@ __kernel void layer_norm_F16toU8( vxc_short8 src0, src1; vxc_float sum = 0, sqr = 0; - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); + _viv_asm(MOV, coord_out.z, baseAddr); for(coord.x = 8; coord.x < (width+8); coord.x += 8) { vxc_half8 val0_h; _viv_asm(COPY, val0_h, src0, 16); - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_float4 sumsqr; VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ @@ -250,7 +250,7 @@ __kernel void layer_norm_F16toU8( vxc_float4 bias_f; for(coord.x = 0; coord.x < width; coord.x += 4) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, 
coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); @@ -273,7 +273,7 @@ __kernel void layer_norm_F16toU8( VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx index bedc979..e461f28 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx @@ -21,24 +21,25 @@ __kernel void layer_norm_I16toI16( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_array_t output, float eps) { - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, coord.z, baseAddr); vxc_short8 src0, src1, dst; vxc_float sum = 0, sqr = 0; - for(; coord.x < width;) + for(; coord_in.x < width;) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; + coord_in.x += 8; vxc_float4 sumsqr; VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ uniInt16SumSqr_dp8x2); @@ -60,11 +61,11 @@ __kernel void layer_norm_I16toI16( int2 coord_bias = (int2)(0, 0); - for(coord.x = 0; coord.x < width; coord.x += 8) + for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord.x; + coord_bias.x = coord_in.x; VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = read_imagef(bias, coord_bias); @@ -92,7 +93,7 @@ __kernel void layer_norm_I16toI16( VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \ + VXC_OP4_NoDest(img_store_3d, output, coord, dst, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx index e39ef71..221e93e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx @@ -11,7 +11,7 @@ __kernel void layer_norm_F16F32toF16( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_array_t output, float eps) { - int4 coord = (int4)(0, get_global_id(1), 
get_global_id(2), 0); + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_out = coord; int8 input_desc, output_desc; @@ -21,20 +21,20 @@ __kernel void layer_norm_F16F32toF16( vxc_short8 src0; vxc_float sum = 0, sqr = 0; - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); Image img1 = create_image_from_image2d(bias, 4); Image img2 = create_image_from_image2d(scale, 4); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); + _viv_asm(MOV, coord_out.z, baseAddr); for(coord.x = 8; coord.x < (width+8); coord.x += 8) { vxc_half8 val0_h; _viv_asm(COPY, val0_h, src0, 16); - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_float4 sumsqr; VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ @@ -49,11 +49,11 @@ __kernel void layer_norm_F16F32toF16( vari += eps; vari = rsqrt(vari); vxc_float4 bias_f, scale_f, in_f; - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww); + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0)); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0)); for(coord.x = 0; coord.x < width; coord.x += 4) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); bias_f = vload4(0, bias_ptr + coord.x); scale_f = vload4(0, scale_ptr + coord.x); @@ -72,7 +72,7 @@ __kernel void layer_norm_F16F32toF16( vxc_short8 dstval; _viv_asm(COPY, dstval, dst, 16); coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } } @@ -100,7 +100,7 @@ __kernel void layer_norm_U8F32toU8( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_array_t output, float eps) { - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_out = coord; vxc_uchar16 src0, src2; @@ -118,11 +118,11 @@ __kernel void layer_norm_U8F32toU8( _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); + _viv_asm(MOV, coord_out.z, baseAddr); for(coord.x = 0; coord.x < width; coord.x += 16) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); tmpSum += (tmpSum1.x); @@ -142,11 +142,11 @@ __kernel void layer_norm_U8F32toU8( Image img1 = create_image_from_image2d(bias, 4); Image img2 = create_image_from_image2d(scale, 4); - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww); - __global float* scale_ptr = (__global 
float*)get_image_ptr_from_coord(img2, coord.ww); + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0)); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0)); for(coord.x = 0; coord.x < width; coord.x += 16) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); bias_f0 = vload4(0, bias_ptr); bias_f1 = vload4(1, bias_ptr); @@ -193,7 +193,7 @@ __kernel void layer_norm_U8F32toU8( VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } } @@ -202,24 +202,25 @@ __kernel void layer_norm_I16F32toI16( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_array_t output, float eps) { - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.w, baseAddr); + _viv_asm(MOV, coord.z, baseAddr); vxc_short8 src0, dst; vxc_float sum = 0, sqr = 0; - for(; coord.x < width;) + for(; coord_in.x < width;) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; + coord_in.x += 8; vxc_float4 sumsqr; VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ uniInt16SumSqr_dp8x2); @@ -243,9 +244,9 @@ __kernel void layer_norm_I16F32toI16( Image img2 = create_image_from_image2d(scale, 4); __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias); __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias); - for(coord.x = 0; coord.x < width; coord.x += 8) + for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = vload4(0, bias_ptr); bias_f1 = vload4(1, bias_ptr); @@ -269,7 +270,7 @@ __kernel void layer_norm_I16F32toI16( VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \ + VXC_OP4_NoDest(img_store_3d, output, coord, dst, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx index 76e3ed9..8c05b02 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx @@ -11,7 +11,7 @@ __kernel void layer_norm_BF16F32toBF16( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_array_t output, float eps) { - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_out = coord; int8 input_desc, output_desc; @@ -30,7 +30,7 @@ __kernel void layer_norm_BF16F32toBF16( _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); + _viv_asm(MOV, coord_out.z, baseAddr); float4 srcA, srcB; for(coord.x = 8; coord.x < (width+8); coord.x += 8) { @@ -40,7 +40,7 @@ __kernel void layer_norm_BF16F32toBF16( uniConvBF16toF32_Part1_2x8); _viv_asm(COPY, srcA, src1, 16); _viv_asm(COPY, srcB, src2, 16); - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); sum += dot(srcA, ones) + dot(srcB, ones); sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones); @@ -52,12 +52,12 @@ __kernel void layer_norm_BF16F32toBF16( vari += eps; vari = rsqrt(vari); vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww); + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0)); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0)); for(coord.x = 0; coord.x < width; coord.x += 8) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = vload4(0, bias_ptr); bias_f1 = vload4(1, bias_ptr); @@ -85,7 +85,7 @@ __kernel void layer_norm_BF16F32toBF16( VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx index d7d7066..e062f9f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx @@ -26,7 +26,7 @@ __kernel void layer_norm_U8toF16( image2d_array_t output, float eps) { - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_out = coord; vxc_uchar16 src0; float sum = 0, sqr = 0; @@ -41,11 +41,11 @@ __kernel void layer_norm_U8toF16( _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); + _viv_asm(MOV, coord_out.z, baseAddr); for(coord.x = 0; coord.x < width; coord.x += 16) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_DP16x1(tmpSum1, src0, src0, 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); tmpSum += (tmpSum1.x); @@ -71,7 +71,7 @@ __kernel void layer_norm_U8toF16( for(coord.x = 0; coord.x < width; coord.x += 16) { - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -121,7 +121,7 @@ __kernel void layer_norm_U8toF16( UniPackFP16even_2x8); _viv_asm(COPY, outval, dst, 16); coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); tmpData2 -= mean; @@ -135,7 +135,7 @@ __kernel void layer_norm_U8toF16( UniPackFP16even_2x8); _viv_asm(COPY, outval, dst, 16); coord_out.x += 8; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx index 03802e8..d494b6d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx @@ -39,7 +39,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq { for(coord.y = 0; coord.y < height;) { - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord.y++; _viv_asm(COPY, in_h, src0, 16); @@ -134,7 +134,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to image2d_array_t output, float eps) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int2 coord_sum = (int2)(0, gidz); int4 coord_para = coord; coord_para.z = (ushort)gidz / (ushort)(height_depth); @@ -157,8 +158,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to int8 input_desc, scale_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; @@ -175,11 +176,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; coord_para.y = coord.y; coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = read_imagef(bias, coord_bias); coord_bias.x += 4; @@ -208,7 +210,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void 
layernorm_wh_F16to VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } @@ -284,7 +286,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to image2d_array_t output, float eps) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int2 coord_sum = (int2)(0, gidz); int4 coord_para = coord; coord_para.z = (ushort)gidz / (ushort)(height_depth); @@ -307,8 +310,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to int8 input_desc, scale_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; @@ -324,11 +327,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; coord_para.y = coord.y; coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = read_imagef(bias, coord_bias); coord_bias.x += 4; @@ -356,7 +360,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx index 61e4e29..7c92a66 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx @@ -43,7 +43,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq { for(coord.y = 0; coord.y < height;) { - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord.y++; vxc_float4 sumsqr; @@ -130,7 +130,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to image2d_array_t output, float eps) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int2 coord_sum = (int2)(0, gidz); int4 coord_para = coord; coord_para.z = (ushort)gidz / 
(ushort)(height_depth); @@ -152,8 +153,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to int8 input_desc, scale_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; @@ -169,11 +170,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; coord_para.y = coord.y; coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = read_imagef(bias, coord_bias); coord_bias.x += 4; @@ -199,7 +201,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx index 521a8cf..4c9e46b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx @@ -48,7 +48,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq { for(coord.y = 0; coord.y < height;) { - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); @@ -101,7 +101,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq { for(; coord.y < endH;) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); @@ -138,7 +138,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF image2d_array_t output, float eps) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int2 coord_sum = (int2)(0, gidz); int4 coord_para = coord; coord_para.z = (ushort)gidz / (ushort)(height_depth); @@ -161,8 +162,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF int8 input_desc, scale_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + 
_viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; @@ -178,10 +179,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; coord_para.y = coord.y; coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = read_imagef(bias, coord_bias); coord_bias.x += 4; @@ -208,7 +210,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } @@ -242,10 +244,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src0, input, coord, 0,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src1, scale, coord, 0,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = read_imagef(bias, coord_bias); coord_bias.x += 4; @@ -281,7 +283,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU image2d_array_t output, float eps) { int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); int2 coord_sum = (int2)(0, gidz); int4 coord_para = coord; coord_para.z = (ushort)gidz / (ushort)(height_depth); @@ -304,8 +307,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU int8 input_desc, scale_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; @@ -321,11 +324,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU for(coord.y = 0; coord.y < height; coord.y++) { - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; coord_para.y = coord.y; coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = read_imagef(bias, coord_bias); coord_bias.x += 4; @@ -351,7 +355,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); - 
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } @@ -385,10 +389,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src0, input, coord, 0,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_ReadImage(src1, scale, coord, 0,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); bias_f0 = read_imagef(bias, coord_bias); coord_bias.x += 4; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx index 2a6b24f..9d2ef89 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx @@ -41,21 +41,21 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) { vxc_float4 tempA0, tempA1, tempA2, tempA3; - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); coord_a.x += 4; coord_b.y += 4; @@ -84,22 +84,22 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, _viv_asm(MOV, coord_b.w, baseAddr); _viv_asm(CONV, valC, sum0); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_b.y++; _viv_asm(CONV, valC, sum1); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_b.y++; _viv_asm(CONV, valC, sum2); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + 
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_b.y++; _viv_asm(CONV, valC, sum3); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } #else @@ -130,21 +130,21 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, { vxc_float4 tempA0, tempA1, tempA2, tempA3; vxc_float4 tempB0, tempB1, tempB2, tempB3; - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); coord_a.x += 4; coord_b.y += 4; @@ -172,22 +172,22 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, _viv_asm(MOV, coord_b.w, baseAddr); _viv_asm(CONV, valC, sum0); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_b.y++; _viv_asm(CONV, valC, sum1); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_b.y++; _viv_asm(CONV, valC, sum2); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_b.y++; _viv_asm(CONV, valC, sum3); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } #endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx index 586b0c6..1929119 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx @@ -42,21 +42,21 @@ __kernel void gemm_F16F16to##dst_type_name( \ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -89,20 +89,20 @@ __kernel void gemm_F16F16to##dst_type_name( \ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } #else @@ -132,21 +132,21 @@ __kernel void gemm_F16F16to##dst_type_name( \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); 
\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -178,20 +178,20 @@ __kernel void gemm_F16F16to##dst_type_name( \ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } #endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx index cc959af..7cdf087 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx @@ -42,21 +42,21 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + 
VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -86,20 +86,20 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } #else @@ -130,21 +130,21 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB0, inputB, 
coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -174,20 +174,20 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } #endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx index 4a5ee66..515d2fb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx @@ -39,21 +39,21 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ vxc_float4 tempZp; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ 
VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -80,22 +80,22 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ _viv_asm(MOV, coord_b.w, baseAddr); \ _viv_asm(CONV, valC, sum0); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum1); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum2); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum3); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_F16_QINT_TO_F16(U8, vxc_uchar16) @@ -131,33 +131,33 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmpA, srcA, 16); \ VXC_DP4x4(tempA0, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32B_4x4); \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmpA, srcA, 16); \ VXC_DP4x4(tempA1, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32B_4x4); \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmpA, srcA, 16); \ VXC_DP4x4(tempA2, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32B_4x4); \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -176,22 +176,22 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ _viv_asm(MOV, coord_b.w, baseAddr); \ _viv_asm(CONV, valC, sum0); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum1); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum2); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum3); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_F16_QINT16_TO_F16(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx index 3617df4..39ddada 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx @@ -37,21 +37,21 @@ __kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t input { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ vxc_float4 tempZp; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + 
VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -77,20 +77,20 @@ __kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t input tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_F16_QINT_TO_QINT(U8, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx index b3674c1..7792e92 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx @@ -38,33 +38,33 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32_4x4); \ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32B_4x4); \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, 
VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32_4x4); \ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32B_4x4); \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32_4x4); \ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32B_4x4); \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -85,19 +85,19 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_QINT_TO_QINT(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx index 4f5e558..2fb3d26 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx @@ -48,9 +48,9 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ \ for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) \ { \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, 
VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ coord_a.y++; \ coord_b.y++; \ @@ -72,10 +72,10 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ \ coord_b.y++; \ @@ -83,10 +83,10 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_TRANSA_QINT(U8, U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16) @@ -133,9 +133,9 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \ \ for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) \ { \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ coord_a.y++; \ coord_b.y++; \ @@ -157,10 +157,10 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ \ coord_b.y++; \ @@ -168,10 +168,10 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 
0,VXC_RM_TowardZero, 0)); \ } GEMM_TRANSA_INPUTB_F16(U8, vxc_uchar16) @@ -215,9 +215,9 @@ __kernel void gemm_transa_F16F16toF16( for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) { - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); coord_a.y++; coord_b.y++; @@ -237,24 +237,24 @@ __kernel void gemm_transa_F16F16toF16( _viv_asm(MOV, coord_b.w, baseAddr); _viv_asm(CONV, valC, sum0); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_b.y++; _viv_asm(CONV, valC, sum1); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_b.y++; _viv_asm(CONV, valC, sum2); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_b.y++; _viv_asm(CONV, valC, sum3); _viv_asm(COPY, outC, valC, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx index 04af02c..8548fe7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx @@ -36,21 +36,21 @@ __kernel void gemm_transb_F16F16toF16(image2d_array_t inputA, { vxc_short8 srcA0,srcA1,srcA2,srcA3; vxc_short8 srcB0,srcB1,srcB2,srcB3; - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_a.x += 8; coord_b.x += 8; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx index e33d532..1c6ad3d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx @@ -41,21 +41,21 @@ __kernel void gemm_transb_F16U8toF16(image2d_array_t inputA, { vxc_short8 srcA0,srcA1,srcA2,srcA3; vxc_uchar8 srcB0,srcB1,srcB2,srcB3; - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_a.x += 8; coord_b.x += 8; @@ -171,21 +171,21 @@ __kernel void gemm_transb_F16U8toU8(image2d_array_t inputA, { vxc_short8 srcA0,srcA1,srcA2,srcA3; vxc_uchar8 srcB0,srcB1,srcB2,srcB3; - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - 
VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_a.x += 8; coord_b.x += 8; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx index 24ea0e0..71bd242 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx @@ -43,21 +43,21 @@ __kernel void gemm_transb_U8U8toF16(image2d_array_t inputA, { vxc_uchar8 srcA0,srcA1,srcA2,srcA3; vxc_uchar8 srcB0,srcB1,srcB2,srcB3; - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_a.x += 8; coord_b.x += 8; @@ -178,21 +178,21 @@ __kernel void gemm_transb_U8U8toU8(image2d_array_t inputA, { vxc_uchar8 srcA0,srcA1,srcA2,srcA3; vxc_uchar8 srcB0,srcB1,srcB2,srcB3; - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, 
VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_a.x += 8; coord_b.x += 8; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx index e5d2c76..1b1e92f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx @@ -33,21 +33,21 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -80,19 +80,19 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, 
output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_QINT_TO_QINT(U8, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx index 8f9ae12..021ff4b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx @@ -45,21 +45,21 @@ __kernel void gemm_##src0_type_name##F16toF16( \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -91,22 +91,22 @@ __kernel void gemm_##src0_type_name##F16toF16( \ \ _viv_asm(CONV, valC, sum0); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum1); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, 
output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum2); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum3); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_QINT_F16_TO_F16(U8, vxc_uchar16) @@ -142,21 +142,21 @@ __kernel void gemm_##src0_type_name##F16toF16( \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -196,22 +196,22 @@ __kernel void gemm_##src0_type_name##F16toF16( \ \ _viv_asm(CONV, valC, sum0); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum1); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum2); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum3); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, 
coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_QINT_F16_TO_F16(U8, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx index 18c4214..6cdf89e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx @@ -42,21 +42,21 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -86,20 +86,20 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \ tmpOut1 = convert_int4_rte(sum1 * in0outScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ tmpOut0 = convert_int4_rte(sum2 * in0outScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum3 * in0outScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, 
output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } #else @@ -133,36 +133,36 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32_4x4); \ _viv_asm(COPY, tmpB, srcB, 16); \ VXC_DP4x4(tempB0, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32_4x4); \ _viv_asm(COPY, tmpB, srcB, 16); \ VXC_DP4x4(tempB1, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32_4x4); \ _viv_asm(COPY, tmpB, srcB, 16); \ VXC_DP4x4(tempB2, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -184,20 +184,20 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1),\ uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } #endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx index 8b925f3..3816d56 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx @@ -40,21 +40,21 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -89,22 +89,22 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ _viv_asm(MOV, coord_b.w, baseAddr); \ _viv_asm(CONV, valC, sum0); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum1); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum2); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum3); \ _viv_asm(COPY, outC, 
valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_QINT_TO_F16(U8, vxc_uchar16) @@ -142,36 +142,36 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ { \ vxc_float4 tempA0, tempA1, tempA2, tempA3; \ vxc_float4 tempB0, tempB1, tempB2, tempB3; \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32_4x4); \ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32B_4x4); \ \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32_4x4); \ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32B_4x4); \ \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32_4x4); \ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvertUint8SubZpToFp32B_4x4); \ \ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ coord_a.x += 4; \ coord_b.y += 4; \ @@ -190,22 +190,22 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ _viv_asm(MOV, coord_b.w, baseAddr); \ _viv_asm(CONV, valC, sum0); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum1); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum2); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + 
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ coord_b.y++; \ _viv_asm(CONV, valC, sum3); \ _viv_asm(COPY, outC, valC, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_QINT16_TO_F16(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx index dffa293..0be50bf 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx @@ -26,15 +26,22 @@ __kernel void moments_axis1_##src0_type_name##toF16( \ short zp = inputZP;\ float4 tmpData0;\ \ - for(coord.y = 0; coord.y < height; coord.y++) \ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + for(coord.y = 1; coord.y < height; ) \ { \ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ sum += (tmpData0); \ sqr += (tmpData0 * tmpData0); \ } \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ sum *= input_scale; \ sqr *= e2InScale; \ \ @@ -71,16 +78,23 @@ __kernel void moments_axis1_##src0_type_name##toF16_2D( \ float4 sum = 0, sqr = 0; \ short zp = inputZP;\ float4 tmpData0;\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - for(coord.y = 0; coord.y < height; coord.y++) \ + for (coord.y = 1; coord.y < height; ) \ { \ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ sum += (tmpData0); \ sqr += (tmpData0 * tmpData0); \ } \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ sum *= input_scale; \ sqr *= e2InScale; \ \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8.vx new file mode 100644 index 0000000..7a6e8f6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8.vx @@ -0,0 +1,346 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform float dimRatio; + +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform float4 output_ZP; +_viv_uniform float4 outputScale; +_viv_uniform float output_ZP0; +_viv_uniform float outputScale0; +_viv_uniform float output_ZP1; +_viv_uniform float outputScale1; +_viv_uniform VXC_512Bits 
uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; + +#define MOMENTS_AXIS0_QINT_U8(src0_type_name, read0_type) \ +__kernel void moments_axis0_##src0_type_name##toU8( \ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidy = get_global_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(0, gidy, gidz, gidz); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + int tmpSum = 0, tmpSqr = 0; \ + int4 tmpSum0, tmpSqr0; \ + int8 inputA_desc; \ + _viv_asm(COPY, inputA_desc, input, sizeof(inputA_desc)); \ + int baseAddr_a = (int)gidz * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP16x1(tmpSum0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + VXC_DP16x1(tmpSqr0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSum += (tmpSum0.x); \ + tmpSqr += (tmpSqr0.x + tmpZp1 * tmpSum0.x); \ + } \ + sqr = (convert_float(tmpSqr) * e2InScale + rowSumScale); \ + sum = convert_float(tmpSum + sumInZp) * input_scale; \ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); \ + mean_vari0 *= dimRatio; \ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; \ + int2 coord_out = (int2)(gidy, gidz); \ + vxc_int4 tmpData = convert_int4_rte(mean_vari0 * outputScale + output_ZP); \ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +MOMENTS_AXIS0_QINT_U8(U8, vxc_uchar16) + +#define MOMENTS_AXIS0_QINT_U8_2D(src0_type_name, read0_type) \ +__kernel void moments_axis0_##src0_type_name##toU8_2D( \ + image2d_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidy = get_global_id(0); \ + int2 coord = (int2)(0, gidy); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + int tmpSum = 0, tmpSqr = 0; \ + int4 tmpSum0, tmpSqr0; \ + \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP16x1(tmpSum0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + VXC_DP16x1(tmpSqr0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSum += (tmpSum0.x); \ + tmpSqr += (tmpSqr0.x + tmpZp1 * tmpSum0.x); \ + } \ + sqr = (convert_float(tmpSqr) * e2InScale + rowSumScale); \ + sum = convert_float(tmpSum + sumInZp) * input_scale; \ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); \ + mean_vari0 *= dimRatio; \ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; \ + int2 coord_out = (int2)(gidy, 0); \ + vxc_int4 tmpData = convert_int4_rte(mean_vari0 * outputScale + output_ZP); \ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +MOMENTS_AXIS0_QINT_U8_2D(U8, vxc_uchar16) + +#define 
MOMENTS_AXIS01_QINT_U8(src0_type_name, read0_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_##src0_type_name##toU8( \ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, gidz); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + int8 inputA_desc; \ + _viv_asm(COPY, inputA_desc, input, sizeof(inputA_desc)); \ + int baseAddr_a = (int)gidz * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + for(coord.x = gidx; coord.x < width; coord.x += 256) \ + { \ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + tmpSum += (tmpSum1); \ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \ + } \ + sqr += (tmpSqr * e2InScale + rowSumScale); \ + sum += (tmpSum + sumInZp) * input_scale; \ + } \ + lcl_sum[lidx] = sum; \ + lcl_sqr[lidx] = sqr; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + int2 coord_out = (int2)(gidz, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + sum = (0); \ + sqr = (0); \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 meanVari; \ + meanVari.x = sum * dimRatio; \ + meanVari.y = sqr * dimRatio - meanVari.x * meanVari.x; \ + vxc_int4 tmpData = convert_int4_rte(meanVari * outputScale + output_ZP); \ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +MOMENTS_AXIS01_QINT_U8(U8, vxc_uchar16) + +#define MOMENTS_AXIS01_QINT_U8_2D(src0_type_name, read0_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_##src0_type_name##toU8_2D( \ + image2d_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int2 coord = (int2)(gidx, 0); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + for(coord.x = gidx; coord.x < width; coord.x += 256) \ + { \ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + tmpSum += (tmpSum1); \ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \ + } \ + sqr += (tmpSqr * e2InScale + rowSumScale); \ + sum += (tmpSum + sumInZp) * input_scale; \ + } \ + lcl_sum[lidx] = sum; \ + 
lcl_sqr[lidx] = sqr; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + int2 coord_out = (int2)(0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + sum = (0); sqr = (0); \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 meanVari; \ + meanVari.x = sum * dimRatio; \ + meanVari.y = sqr * dimRatio - meanVari.x * meanVari.x; \ + vxc_int4 tmpData = convert_int4_rte(meanVari * outputScale + output_ZP); \ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +MOMENTS_AXIS01_QINT_U8_2D(U8, vxc_uchar16) + +#define MOMENTS_AXIS1_QINT_U8(src0_type_name, read0_type) \ +__kernel void moments_axis1_##src0_type_name##toU8( \ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, gidz); \ + read0_type src0; \ + float4 sum = 0, sqr = 0; \ + short zp = inputZP;\ + float4 tmpData0;\ + \ + int8 inputA_desc; \ + _viv_asm(COPY, inputA_desc, input, sizeof(inputA_desc)); \ + int baseAddr_a = (int)gidz * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + for(coord.y = 1; coord.y < height; ) \ + { \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ + } \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ + sum *= input_scale; \ + sqr *= e2InScale; \ + \ + float4 mean = sum * dimRatio; \ + float4 vari = sqr * dimRatio; \ + vari = vari - mean * mean; \ + vxc_int4 tmpVal0 = convert_int4_rte(mean * outputScale0 + output_ZP0); \ + vxc_int4 tmpVal1 = convert_int4_rte(vari * outputScale1 + output_ZP1); \ + VXC_DP2x8(src0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + int2 coord_out = (int2)(gidx, gidz); \ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, src0.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +MOMENTS_AXIS1_QINT_U8(U8, vxc_uchar16) + +#define MOMENTS_AXIS1_QINT_U8_2D(src0_type_name, read0_type) \ +__kernel void moments_axis1_##src0_type_name##toU8_2D( \ + image2d_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0); \ + int2 coord = (int2)(gidx, 0); \ + read0_type src0; \ + float4 sum = 0, sqr = 0; \ + short zp = inputZP;\ + float4 tmpData0;\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + for (coord.y = 1; coord.y < height; ) \ + { \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ + } \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ + sum *= input_scale; \ + sqr *= e2InScale; \ + \ + float4 mean = sum * dimRatio; \ + float4 vari = sqr * dimRatio - mean * mean; \ + vxc_int4 tmpVal0 = convert_int4_rte(mean * outputScale0 + output_ZP0); \ + vxc_int4 tmpVal1 = convert_int4_rte(vari * outputScale1 + output_ZP1); \ + VXC_DP2x8(src0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + int2 coord_out = (int2)(gidx, 0); \ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, src0.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +MOMENTS_AXIS1_QINT_U8_2D(U8, vxc_uchar16) + +#define MOMENTS_AXIS2_QINT_U8(src0_type_name, read0_type) \ +__kernel void moments_axis2_##src0_type_name##toU8( \ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int4 coord = (int4)(gidx, gidy, 0, 0); \ + read0_type src0; \ + float4 sum = 0, sqr = 0; \ + short zp = inputZP;\ + float4 tmpData0;\ + \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ + } \ + sum *= input_scale; \ + sqr *= e2InScale; \ + \ + float4 mean = sum * dimRatio; \ + float4 vari = sqr * dimRatio - mean * mean; \ + vxc_int4 tmpVal0 = convert_int4_rte(mean * outputScale0 + output_ZP0); \ + vxc_int4 tmpVal1 = convert_int4_rte(vari * outputScale1 + output_ZP1); \ + VXC_DP2x8(src0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + int2 coord_out = (int2)(gidx, gidy); \ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, src0.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +MOMENTS_AXIS2_QINT_U8(U8, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx new file mode 100644 index 0000000..b456ee6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx @@ -0,0 +1,75 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform float dimRatio; + +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform float4 output_ZP; +_viv_uniform float4 outputScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +#define MOMENTS_AXIS012_QINT_U8(src0_type_name, read0_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toU8( \ + image2d_array_t 
input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int4 coord = (int4)(gidx, 0, 0, 0); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + for(coord.x = gidx; coord.x < width; coord.x += 256) \ + { \ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + tmpSum += (tmpSum1); \ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \ + } \ + sqr += (tmpSqr * e2InScale + rowSumScale); \ + sum += (tmpSum + sumInZp) * input_scale; \ + } \ + } \ + lcl_sum[lidx] = sum; \ + lcl_sqr[lidx] = sqr; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + int2 coord_out = (int2)(0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + sum = (0); sqr = (0); \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 meanVari; \ + meanVari.x = sum * dimRatio; \ + meanVari.y = sqr * dimRatio - meanVari.x * meanVari.x; \ + vxc_int4 tmpData = convert_int4_rte(meanVari * outputScale + output_ZP); \ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/random_multinomial.vx b/src/tim/vx/internal/src/libnnext/ops/vx/random_multinomial.vx index b588303..8d1b25b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/random_multinomial.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/random_multinomial.vx @@ -8,15 +8,6 @@ _viv_uniform int iter; _viv_uniform int stride; _viv_uniform float re_rand_max; -inline uchar* get_image2D_array_ptr(image2d_array_t input) -{ - int8 desc; - _viv_asm(COPY, desc, input, sizeof(desc)); - uchar *src_ptr = (uchar*)desc.s0; - - return src_ptr; -} - uint4 _philox4x32bumpkey(uint4 key) { uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0); @@ -72,15 +63,16 @@ uint4 philox4x32_R_10(uint4 ctr, uint4 key) } __kernel void random_seed( - __read_only image2d_array_t seeds, - __write_only image2d_array_t output) + __read_only image2d_t seeds, + __write_only image2d_t output) { int gidx = get_global_id(0); int gidy = get_global_id(1); int4 coord = (int4)(gidx << 1, gidy, 0, 0); int width = get_image_width(seeds); - __global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds); + Image s_img = create_image_from_image2d(seeds, 4); + __global uint* seeds_ptr = (__global uint*)s_img.ptr; seeds_ptr = seeds_ptr + coord.x + coord.y * width; uint4 key = vload4(0, seeds_ptr); @@ -88,8 +80,9 @@ __kernel void random_seed( float4 result = 0; width = get_image_width(output); + Image o_img = create_image_from_image2d(output, 4); coord.x = gidx * 
stride + width * coord.y; - __global float* output_ptr = (__global float*)get_image2D_array_ptr(output); + __global float* output_ptr = (__global float*)o_img.ptr; output_ptr += coord.x; for(int i = 0; i < iter; i++) @@ -111,8 +104,8 @@ float4 eltwise_unary_exp(float4 x) // x dim = 1 __kernel void random_multinomial_cdf_F16 ( - __read_only image2d_array_t input, - __write_only image2d_array_t output + __read_only image2d_t input, + __write_only image2d_t output ) { int gidx = get_global_id(0); @@ -127,7 +120,8 @@ __kernel void random_multinomial_cdf_F16 int class_max_stride = get_image_width(input); int offset = gidy * class_max_stride; - __global float* output_ptr = (__global float*)get_image2D_array_ptr(output); + Image o_img = create_image_from_image2d(output, 4); + __global float* output_ptr = (__global float*)o_img.ptr; __global float* cdfPtr = output_ptr + offset; VXC_ReadImage(maxData, input, coord, VXC_5BITOFFSET_XY(0, 0),\ @@ -170,8 +164,8 @@ __kernel void random_multinomial_cdf_F16 __kernel void random_multinomial_cdf_F32 ( - __read_only image2d_array_t input, - __write_only image2d_array_t output + __read_only image2d_t input, + __write_only image2d_t output ) { int gidx = get_global_id(0); @@ -187,11 +181,13 @@ __kernel void random_multinomial_cdf_F32 int class_max_stride = get_image_width(input); float tmp = 0; int offset = gidy * class_max_stride; - __global float* output_ptr = (__global float*)get_image2D_array_ptr(output); + Image o_img = create_image_from_image2d(output, 4); + __global float* output_ptr = (__global float*)o_img.ptr; __global float* cdfPtr = output_ptr + offset; int width = get_image_width(input); - __global float* input_ptr = (__global float*)get_image2D_array_ptr(input); + Image i_img = create_image_from_image2d(input, 4); + __global float* input_ptr = (__global float*)i_img.ptr; input_ptr = input_ptr + coord.x + coord.y * width; float4 maxVal = vload4(0, input_ptr); @@ -235,10 +231,10 @@ uint upper_bound(float* a, int n, float x) // one thread calculate 4 __kernel void random_multinomial ( - __read_only image2d_array_t randoms, - __read_only image2d_array_t cdfs, - __write_only image2d_array_t output, - int class_size + __read_only image2d_t randoms, + __read_only image2d_t cdfs, + __write_only image2d_t output, + int class_size ) { int gidx = get_global_id(0); @@ -247,17 +243,20 @@ __kernel void random_multinomial int class_max_stride = get_image_width(cdfs); int offset = gidy * class_max_stride; - __global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs); + Image cdf_img = create_image_from_image2d(cdfs, 4); + __global float* cdf_ptr = (__global float*)cdf_img.ptr; __global float* cdfPtr = cdf_ptr + offset; int width = get_image_width(randoms); offset = coord.x + coord.y * width; - __global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms); + Image r_img = create_image_from_image2d(randoms, 4); + __global float* randoms_ptr = (__global float*)r_img.ptr; randoms_ptr = randoms_ptr + offset; width = get_image_width(output); offset = coord.x + coord.y * width; - __global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output); + Image o_img = create_image_from_image2d(output, 4); + __global uint* output_ptr = (__global uint*)o_img.ptr; output_ptr = output_ptr + offset; float4 ran = vload4(0, randoms_ptr); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx b/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx index 5898ea4..90e5d85 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx @@ -66,7 +66,7 @@ __kernel void repeat_I16_axis0( image2d_array_t input0, image2d_t input1, image2d_t input2, image2d_array_t output, int axis) { - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); vxc_short8 src0; VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -83,6 +83,7 @@ __kernel void repeat_I16_axis0( int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord.z, baseAddr); int end = len + start; + coord.w = get_global_id(2); for(coord.y = start; coord.y < end; coord.y++) { @@ -192,6 +193,7 @@ __kernel void repeat_U8_axis0( int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord.z, baseAddr); int end = len + start; + coord.w = get_global_id(2); for(coord.y = start; coord.y < end; coord.y++) { diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx index d22a292..de8c81e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx @@ -25,7 +25,7 @@ __kernel void repeat_I16_axis1( image2d_array_t output, int axis) { int gidy = get_global_id(1); - int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0); + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), get_global_id(2)); vxc_short8 src0, src1, src2, src3, src4, src5, src6, src7; int8 input_desc, output_desc; @@ -130,7 +130,7 @@ __kernel void repeat_U8_axis1( image2d_array_t output, int axis) { int gidy = get_global_id(1); - int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0); + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), get_global_id(2)); vxc_uchar16 src0, src1, src2, src3, src4, src5, src6, src7; int8 input_desc, output_desc; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx index cfbae00..a9a79d0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx @@ -40,16 +40,16 @@ __kernel void resize_1d_bilinear_BF16toBF16_DOWN do { - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 src; @@ -66,7 +66,7 @@ __kernel void resize_1d_bilinear_BF16toBF16_DOWN vxc_ushort8 tmp, dst; _viv_asm(COPY, tmp, dst4, 16); dst.s0123 = tmp.s1357; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, 
dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_in.y++; coord_out.y ++; @@ -98,9 +98,9 @@ __kernel void resize_1d_bilinear_BF16toBF16_UP _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -121,9 +121,9 @@ __kernel void resize_1d_bilinear_BF16toBF16_UP VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_in.y ++; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 dst_tmp; @@ -141,7 +141,7 @@ __kernel void resize_1d_bilinear_BF16toBF16_UP _viv_asm(COPY, tmp, dst4, 16); dst.s0123 = tmp.s1357; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_out.y++; } while (coord_out.y < out_height); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_DOWN_NX.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_DOWN_NX.vx index e8cc06c..c0ba044 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_DOWN_NX.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_DOWN_NX.vx @@ -19,12 +19,12 @@ _viv_uniform int out_height; \ while (coord_out.y < out_height) \ { \ - VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, read_data, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, in0, read_data, 16); \ VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \ _viv_asm(COPY, save_data, result, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, save_data, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ coord_in.y++; \ coord_out.y++; \ @@ -83,15 +83,15 @@ RESIZE_1D_2X_DOWN_8BIT_SAME(I8, I8, vxc_char16, vxc_char16) \ while (coord_out.y < out_height) \ { \ - VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, read_data, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, in0, read_data, 16); \ - VXC_OP4(img_load_3d, read_data1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), \ + VXC_OP4(img_load_3d, read_data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, in1, read_data1, 16); \ VXC_DP2x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \ _viv_asm(COPY, save_data, result, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \ + VXC_OP4_NoDest(img_store_3d, 
output, coord_out.xywz, save_data, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ coord_in.y++; \ coord_out.y++; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx index 3487679..e74328b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx @@ -48,16 +48,16 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN do { - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, src_half, src, 16); @@ -71,7 +71,7 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN _viv_asm(CONV, tmp, dst4); VXC_DP2x8(dst, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); _viv_asm(COPY, result, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_in.y++; coord_out.y ++; @@ -113,16 +113,16 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN do { - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, src_half, src, 16); @@ -136,7 +136,7 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN int4 dst = convert_int4_rte(dst4); VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_in.y++; coord_out.y ++; @@ -168,9 +168,9 @@ __kernel void resize_1d_bilinear_F16toF16_UP _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -192,9 +192,9 @@ __kernel void resize_1d_bilinear_F16toF16_UP _viv_asm(COPY, top, dst0, 16); coord_in.y ++; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; @@ -209,7 +209,7 @@ __kernel void resize_1d_bilinear_F16toF16_UP VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); _viv_asm(COPY, dst0, top, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_out.y++; } while (coord_out.y < out_height); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx index 956dc62..61ff2e9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx @@ -40,9 +40,9 @@ __kernel void resize_1d_bilinear_I16toI16_UP _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -67,9 +67,9 @@ __kernel void resize_1d_bilinear_I16toI16_UP float4 right4; coord_in.y ++; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); @@ -79,7 +79,7 @@ __kernel void resize_1d_bilinear_I16toI16_UP int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_out.y ++; } while (coord_out.y < out_height); @@ -117,16 +117,16 @@ __kernel void resize_1d_bilinear_I16toI16_DOWN do { - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + 
VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4); @@ -138,7 +138,7 @@ __kernel void resize_1d_bilinear_I16toI16_DOWN int4 dst = convert_int4_rte(dst4); VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_in.y++; coord_out.y ++; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx index e25071c..95a8d5b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx @@ -41,7 +41,7 @@ __kernel void resize_1d_bilinear_I8toI8_UP _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -63,7 +63,7 @@ __kernel void resize_1d_bilinear_I8toI8_UP _viv_asm(COPY, top, dst0, 16); coord_in.y++; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); float4 left4; @@ -79,7 +79,7 @@ __kernel void resize_1d_bilinear_I8toI8_UP dst4 = dst4 * dfpScale; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_out.y ++; @@ -118,16 +118,16 @@ __kernel void resize_1d_bilinear_I8toI8_DOWN do { - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertDFP2FP32_left_4x4); @@ -139,7 +139,7 @@ __kernel void resize_1d_bilinear_I8toI8_DOWN int4 dst = convert_int4_rte(dst4); VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_in.y++; coord_out.y ++; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx index b25fba9..f7243a6 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx @@ -48,16 +48,16 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN do { - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); @@ -69,7 +69,7 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN _viv_asm(CONV, dst, dst4); vxc_short8 dst_short; _viv_asm(COPY, dst_short, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_short, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_in.y++; coord_out.y ++; @@ -106,7 +106,7 @@ __kernel void resize_1d_bilinear_U8toU8_UP _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -126,7 +126,7 @@ __kernel void resize_1d_bilinear_U8toU8_UP { VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_in.y++; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); float4 left4; @@ -141,7 +141,7 @@ __kernel void resize_1d_bilinear_U8toU8_UP dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_out.y ++; @@ -182,16 +182,16 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN do { - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); @@ -203,7 +203,7 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_in.y++; coord_out.y ++; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8_opt.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8_opt.vx index ab7e74b..ce34d66 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8_opt.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8_opt.vx @@ -35,7 +35,7 @@ __kernel void resize_1d_bilinear_U8toU8_UP_opt _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -63,14 +63,14 @@ __kernel void resize_1d_bilinear_U8toU8_UP_opt VXC_BitExtract(src_mask, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_in.y++; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_uchar16 dst; VXC_DP4x4(dst, src_mask, lerp, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); coord_out.y ++; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_UP_NX.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_UP_NX.vx index 3ddd305..9d33eec 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_UP_NX.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_UP_NX.vx @@ -19,12 +19,12 @@ _viv_uniform int out_height; \ while (coord_out.y < out_height) \ { \ - VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_OP4(img_load_3d, read_data, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, in0, read_data, 16); \ VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxUp_2x8); \ _viv_asm(COPY, save_data, result, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \ + VXC_OP4_NoDest(img_store_3d, output, 
coord_out.xywz, save_data, \ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ coord_in.y++; \ coord_out.y++; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_nearest.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_nearest.vx index 75d0c47..1de48d7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_nearest.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_nearest.vx @@ -36,19 +36,19 @@ __kernel void resize_1d_nearest_F16toF16 baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -76,9 +76,9 @@ __kernel void resize_1d_nearest_F16toF16_op baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16); @@ -88,7 +88,7 @@ __kernel void resize_1d_nearest_F16toF16_op VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -116,19 +116,19 @@ __kernel void resize_1d_nearest_I8toI8 baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, 
src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -156,7 +156,7 @@ __kernel void resize_1d_nearest_I8toI8_op baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); @@ -167,7 +167,7 @@ __kernel void resize_1d_nearest_I8toI8_op VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, dst, dst0, 8); VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -194,16 +194,16 @@ __kernel void resize_1d_nearest_U8toU8 baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 multiplier; @@ -211,7 +211,7 @@ __kernel void resize_1d_nearest_U8toU8 VXC_DP2x8(src, src, multiplier, \ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -238,7 +238,7 @@ __kernel void resize_1d_nearest_U8toU8_op baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); @@ -250,7 +250,7 @@ __kernel void resize_1d_nearest_U8toU8_op _viv_asm(COPY, multiplier, multAndoutZP, 16); VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -277,21 +277,21 @@ __kernel void resize_1d_nearest_I16toI16 baseAddr = (int)coord_out.z * 
output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -319,9 +319,9 @@ __kernel void resize_1d_nearest_I16toI16_op baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16); @@ -332,6 +332,6 @@ __kernel void resize_1d_nearest_I16toI16_op _viv_asm(COPY, dst, dst0, 8); VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx index 8f7826b..9a379a9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx @@ -37,24 +37,24 @@ __kernel void resize_bilinear_BF16toBF16_DOWN _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 
0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 src; @@ -87,7 +87,7 @@ __kernel void resize_bilinear_BF16toBF16_DOWN baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -120,13 +120,13 @@ __kernel void resize_bilinear_BF16toBF16_UP int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -155,20 +155,18 @@ __kernel void resize_bilinear_BF16toBF16_UP VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.w += input_desc.s4; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + coord_in.zw += (int2)(1, input_desc.s4); + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.z ++; vxc_ushort8 dst_tmp; - VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); _viv_asm(COPY, left4, dst_tmp, 16); VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); @@ -190,8 +188,8 @@ __kernel void resize_bilinear_BF16toBF16_UP _viv_asm(COPY, tmp, 
dst4, 16); dst.s0123 = tmp.s1357; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.w += output_desc.s4; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.zw += (int2)(1, output_desc.s4); } VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -214,5 +212,5 @@ __kernel void resize_bilinear_BF16toBF16_UP vxc_ushort8 tmp, dst; _viv_asm(COPY, tmp, dst4, 16); dst.s0123 = tmp.s1357; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx index 463b5a2..2546ca5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx @@ -39,24 +39,24 @@ __kernel void resize_bilinear_F16toF16_DOWN _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, top_short, 16); _viv_asm(COPY, bottom, bottom_short, 16); @@ -85,7 +85,7 @@ __kernel void resize_bilinear_F16toF16_DOWN baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -116,24 +116,24 @@ __kernel void resize_bilinear_F16toU8_DOWN _viv_asm(COPY, 
input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, top_short, 16); _viv_asm(COPY, bottom, bottom_short, 16); @@ -160,7 +160,7 @@ __kernel void resize_bilinear_F16toU8_DOWN baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_uchar, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -196,13 +196,13 @@ __kernel void resize_bilinear_F16toF16_UP int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -232,16 +232,15 @@ __kernel void resize_bilinear_F16toF16_UP _viv_asm(COPY, bottom, dst1, 16); - coord_in.w += input_desc.s4; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + coord_in.zw += (int2)(1, 
input_desc.s4); + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.z ++; VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); @@ -256,8 +255,8 @@ __kernel void resize_bilinear_F16toF16_UP VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); _viv_asm(COPY, dst0, top, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.w += output_desc.s4; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.zw += (int2)(1, output_desc.s4); } VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -278,5 +277,5 @@ __kernel void resize_bilinear_F16toF16_UP VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); _viv_asm(COPY, dst0, top, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx index bdfa3fb..8f4735b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx @@ -47,13 +47,13 @@ __kernel void resize_bilinear_I16toI16_UP int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -81,16 +81,15 @@ __kernel void resize_bilinear_I16toI16_UP VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); - coord_in.w += input_desc.s4; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + coord_in.zw += (int2)(1, 
input_desc.s4); + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1), + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.z ++; VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); @@ -108,8 +107,8 @@ __kernel void resize_bilinear_I16toI16_UP VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.w += output_desc.s4; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.zw += (int2)(1, output_desc.s4); } VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -129,7 +128,7 @@ __kernel void resize_bilinear_I16toI16_UP dst4 = dst4 * dfpScale; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -161,24 +160,24 @@ __kernel void resize_bilinear_I16toI16_DOWN _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), 
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; @@ -209,7 +208,7 @@ __kernel void resize_bilinear_I16toI16_DOWN baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx index 0be6cc5..bcb465e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx @@ -47,9 +47,9 @@ __kernel void resize_bilinear_I8toI8_UP int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -78,12 +78,11 @@ __kernel void resize_bilinear_I8toI8_UP _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); - coord_in.w += input_desc.s4; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + coord_in.zw += (int2)(1, input_desc.s4); + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.z ++; VXC_DP4x4(left4, top, top, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); @@ -104,8 +103,8 @@ __kernel void resize_bilinear_I8toI8_UP int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.w += output_desc.s4; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.zw += (int2)(1, output_desc.s4); } VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -127,7 +126,7 @@ __kernel void resize_bilinear_I8toI8_UP dst4 = dst4 * dfpScale; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_I8toI8_DOWN @@ -155,24 +154,24 @@ __kernel void resize_bilinear_I8toI8_DOWN _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, 
top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; @@ -203,5 +202,5 @@ __kernel void resize_bilinear_I8toI8_DOWN _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx index 39f239a..88f0cd5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx @@ -37,24 +37,24 @@ __kernel void resize_bilinear_U8toF16_DOWN _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), 
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; @@ -87,7 +87,7 @@ __kernel void resize_bilinear_U8toF16_DOWN _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short.s0246, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_short.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -128,9 +128,9 @@ __kernel void resize_bilinear_U8toU8_UP int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -157,12 +157,12 @@ __kernel void resize_bilinear_U8toU8_UP VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.w += input_desc.s4; - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + coord_in.zw += (int2)(1, input_desc.s4); + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.z ++; + unsigned char inputZP; _viv_asm(COPY, inputZP, input_ZP, 4); VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); @@ -179,8 +179,8 @@ __kernel void resize_bilinear_U8toU8_UP dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.w += output_desc.s4; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.zw += (int2)(1, output_desc.s4); } VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -201,7 +201,7 @@ __kernel void resize_bilinear_U8toU8_UP dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, 
output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } __kernel void resize_bilinear_U8toU8_DOWN @@ -229,24 +229,24 @@ __kernel void resize_bilinear_U8toU8_DOWN _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); float4 left4; @@ -276,6 +276,6 @@ __kernel void resize_bilinear_U8toU8_DOWN _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx index 1c1071d..4339bff 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx @@ -23,9 +23,9 @@ __kernel void resize_bilinear_U8toU8_SAME_2x_upsample_half_pixel_centers _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); int8 output_desc; @@ -37,26 +37,26 @@ __kernel void resize_bilinear_U8toU8_SAME_2x_upsample_half_pixel_centers { 
VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_out.y++; - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_out.y++; VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord_out.y++; VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_in.y += 2; coord_out.y++; @@ -86,9 +86,9 @@ __kernel void resize_bilinear_U8toU8_SAME_4x_upsample_half_pixel_centers _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); int8 output_desc; @@ -102,44 +102,44 @@ __kernel void resize_bilinear_U8toU8_SAME_4x_upsample_half_pixel_centers VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_out.y++; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); 
coord_out.y++; VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_out.y++; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_out.y++; VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_out.y++; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_out.y++; VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_out.y++; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); coord_in.y += 2; coord_out.y++; @@ -173,13 +173,13 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in3, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); int8 
output_desc; @@ -191,7 +191,7 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); coord_out.y++; @@ -205,13 +205,13 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); coord_out.y++; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); coord_out.y++; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); coord_out.y++; @@ -221,9 +221,9 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); coord_out.y++; - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx index 59e8211..8c30927 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx @@ -42,9 +42,9 @@ __kernel void resize_bilinear_U8toU8_UP_opt _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 bitextract_p0; @@ -75,21 +75,19 @@ __kernel void resize_bilinear_U8toU8_UP_opt VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.w += input_desc.s4; - VXC_OP4(img_load_3d, src0, 
input, coord_in.xyww, + coord_in.zw += (int2)(1, input_desc.s4); + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_uchar16 dst; VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - coord_out.w += output_desc.s4; - - coord_in.z ++; + coord_out.zw += (int2)(1, output_desc.s4); } VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); @@ -97,7 +95,7 @@ __kernel void resize_bilinear_U8toU8_UP_opt vxc_uchar16 dst; VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx index 7172017..b5b0162 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx @@ -33,16 +33,16 @@ __kernel void resize_nearest_F16toF16 int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); @@ -50,7 +50,7 @@ __kernel void resize_nearest_F16toF16 _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -73,9 +73,9 @@ __kernel void resize_nearest_F16toF16_op int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16); @@ 
-89,7 +89,7 @@ __kernel void resize_nearest_F16toF16_op _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -112,16 +112,16 @@ __kernel void resize_nearest_I8toI8 int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); @@ -130,7 +130,7 @@ __kernel void resize_nearest_I8toI8 _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -153,7 +153,7 @@ __kernel void resize_nearest_I8toI8_op int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); @@ -169,7 +169,7 @@ __kernel void resize_nearest_I8toI8_op _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -191,16 +191,16 @@ __kernel void resize_nearest_U8toU8 int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_OP4(img_load_3d, src, 
input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); vxc_ushort8 multiplier; @@ -212,7 +212,7 @@ __kernel void resize_nearest_U8toU8 _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -234,7 +234,7 @@ __kernel void resize_nearest_U8toU8_op int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); @@ -250,7 +250,7 @@ __kernel void resize_nearest_U8toU8_op _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -272,16 +272,16 @@ __kernel void resize_nearest_I16toI16 int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.y; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.z; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); coord_in.x = in_x_idx.w; - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); @@ -291,7 +291,7 @@ __kernel void resize_nearest_I16toI16 _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } @@ -314,9 +314,9 @@ __kernel void resize_nearest_I16toI16_op int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); //in_x_idx = in_x_idx - in_x_idx.xxxx; @@ -332,6 +332,6 @@ __kernel void 
resize_nearest_I16toI16_op _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_big.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_big.vx index 0a08f33..8d4b51a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_big.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_big.vx @@ -10,15 +10,6 @@ _viv_uniform int offsetX; _viv_uniform int offsetY; _viv_uniform int offsetZ; -inline uchar* get_image2D_array_ptr(image2d_t input) -{ - int8 desc; - _viv_asm(COPY, desc, input, sizeof(desc)); - uchar *src_ptr = (uchar*)desc.s0; - - return src_ptr; -} - __kernel void scatter_nd_F16toF16_big( __read_only image2d_t input0, __read_only image2d_t input1, @@ -34,9 +25,12 @@ __kernel void scatter_nd_F16toF16_big( vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); vxc_half8 sum; _viv_asm(COPY, sum, tmpVal, 16); - __global int* index_ptr = (__global int*)get_image2D_array_ptr(input0); - __global short* update_ptr = (__global short*)get_image2D_array_ptr(input1); - __global short* output_ptr = (__global short*)get_image2D_array_ptr(output); + Image i0_img = create_image_from_image2d(input0, 4); + __global int* index_ptr = (__global int*)i0_img.ptr; + Image i1_img = create_image_from_image2d(input1, 2); + __global short* update_ptr = (__global short*)i1_img.ptr; + Image o_img = create_image_from_image2d(output, 2); + __global short* output_ptr = (__global short*)o_img.ptr; for(int i = 0; i < index_num; i++) { int4 indice = vload4(0, index_ptr + i * coord_dim); @@ -70,9 +64,12 @@ __kernel void scatter_nd_##src0_type_name##to##src0_type_name##_big( \ int firstFlg = 1; \ \ data_type sum = (data_type)(0, 0, 0, 0, 0, 0, 0, 0); \ - __global int* index_ptr = (__global int*)get_image2D_array_ptr(input0); \ - __global ptr_type* update_ptr = (__global ptr_type*)get_image2D_array_ptr(input1); \ - __global ptr_type* output_ptr = (__global ptr_type*)get_image2D_array_ptr(output); \ + Image i0_img = create_image_from_image2d(input0, 2); \ + __global int* index_ptr = (__global int*)i0_img.ptr; \ + Image i1_img = create_image_from_image2d(input1, 2); \ + __global ptr_type* update_ptr = (__global ptr_type*)i1_img.ptr; \ + Image o_img = create_image_from_image2d(output, 2); \ + __global ptr_type* output_ptr = (__global ptr_type*)o_img.ptr; \ for(int i = 0; i < index_num; i++) \ { \ int4 indice = vload4(0, index_ptr + i * coord_dim); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx new file mode 100644 index 0000000..8532ae0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx @@ -0,0 +1,231 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccumulateSum_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_1_Lo_2x8; +_viv_uniform int index_num; +_viv_uniform int offset_idx; +_viv_uniform int offsetX; +_viv_uniform int offsetY; +_viv_uniform int offsetZ; +_viv_uniform int offsetW; +_viv_uniform int2 multAndoutZP0; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits 
uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +__kernel void scatter_nd_update_F16F16toF16( + __read_only image2d_t input0, + __read_only image2d_t input1, + __read_only image2d_t input2, + image2d_array_t output, + int width, + int area, + int vol, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int cnt = 0; + + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_half8 sum; + _viv_asm(COPY, sum, tmpVal, 16); + Image img1 = create_image_from_image2d(input1, 4); + __global int* index_ptr = (__global int*)img1.ptr; + for(int i = 0; i < index_num; i++) + { + //int4 indice = read_imagei(input1, (int2)(0, i)); + int4 indice = vload4(0, index_ptr + offset_idx); + index_ptr += coord_dim; + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; + if(gidy == idx) + { + vxc_half8 src; + VXC_ReadImage(tmpVal, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + cnt++; + _viv_asm(COPY, src, tmpVal, 16); + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); + } + } + _viv_asm(COPY, tmpVal, sum, 16); + int2 coord = (int2)(gidx, gidy); + if(cnt == 0) + { + VXC_ReadImage(tmpVal, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + VXC_WriteImage(output, coord, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +#define SCATTER_ND_UPDATE_QINT(src0_type_name, src2_type_name, out_type_name, data_type) \ +__kernel void scatter_nd_update_##src0_type_name##src2_type_name##to##out_type_name##( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __read_only image2d_t input2, \ + image2d_array_t output, \ + int width, \ + int area, \ + int vol, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int cnt = 0; \ + \ + data_type sum = (data_type)(0, 0, 0, 0, 0, 0, 0, 0); \ + Image img1 = create_image_from_image2d(input1, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + for(int i = 0; i < index_num; i++) \ + { \ + int4 indice = vload4(0, index_ptr + offset_idx); \ + index_ptr += coord_dim; \ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \ + if(gidy == idx) \ + { \ + data_type src; \ + VXC_ReadImage(src, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + cnt++; \ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); \ + } \ + } \ + int2 coord = (int2)(gidx, gidy); \ + vxc_ushort8 ms0; \ + data_type dst; \ + if(cnt == 0) \ + { \ + VXC_ReadImage(sum, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + else \ + { \ + _viv_asm(COPY, ms0, multAndoutZP1, 16); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_1_Lo_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +SCATTER_ND_UPDATE_QINT(U8, U8, U8, vxc_uchar8) +SCATTER_ND_UPDATE_QINT(I8, I8, I8, vxc_char8) +SCATTER_ND_UPDATE_QINT(I16, I16, I16, vxc_short8) + +#define SCATTER_ND_UPDATE_QINT_TO_F16(src0_type, data_type) \ +__kernel void scatter_nd_update_##src0_type##src0_type##toF16( \ + __read_only image2d_t input0, \ + 
__read_only image2d_t input1, \ + __read_only image2d_t input2, \ + image2d_array_t output, \ + int width, \ + int area, \ + int vol, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int cnt = 0; \ + vxc_short8 sum = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + data_type src; \ + Image img1 = create_image_from_image2d(input1, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + for(int i = 0; i < index_num; i++) \ + { \ + int4 indice = vload4(0, index_ptr + offset_idx); \ + index_ptr += coord_dim; \ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \ + if(gidy == idx) \ + { \ + VXC_ReadImage(src, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + cnt++; \ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); \ + } \ + } \ + int2 coord = (int2)(gidx, gidy); \ + vxc_ushort8 ms0; \ + vxc_half8 tmpDst; \ + vxc_short8 dst; \ + if(cnt == 0) \ + { \ + VXC_ReadImage(src, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst, tmpDst, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + else \ + { \ + _viv_asm(COPY, ms0, multAndoutZP1, 16); \ + VXC_DP2x8(tmpDst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_1_Lo_2x8); \ + _viv_asm(COPY, dst, tmpDst, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +SCATTER_ND_UPDATE_QINT_TO_F16(U8, vxc_uchar8) +SCATTER_ND_UPDATE_QINT_TO_F16(I8, vxc_char8) +SCATTER_ND_UPDATE_QINT_TO_F16(I16, vxc_short8) + +__kernel void scatter_nd_update_BF16BF16toBF16( + __read_only image2d_t input0, + __read_only image2d_t input1, + __read_only image2d_t input2, + image2d_array_t output, + int width, + int area, + int vol, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int cnt = 0; + + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_ushort8 src0, src1, src2; + float4 srcA, srcB; + float4 sum0 = (float4)(0); + float4 sum1 = sum0; + + Image img1 = create_image_from_image2d(input1, 4); + __global int* index_ptr = (__global int*)img1.ptr; + for(int i = 0; i < index_num; i++) + { + //int4 indice = read_imagei(input1, (int2)(0, i)); + int4 indice = vload4(0, index_ptr + offset_idx); + index_ptr += coord_dim; + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; + if(gidy == idx) + { + VXC_ReadImage(src0, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + cnt++; + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + sum0 += srcA; + sum1 += srcB; + } + } + int2 coord = (int2)(gidx, gidy); + if(cnt == 0) + { + VXC_ReadImage(src2, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + else + { + _viv_asm(COPY, src0, sum0, 16); + _viv_asm(COPY, src1, sum1, 16); + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + 
VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_atom.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_atom.vx new file mode 100644 index 0000000..44f0c63 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_atom.vx @@ -0,0 +1,192 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; +_viv_uniform int update_width; +_viv_uniform int output_width; +_viv_uniform int2 multAndoutZP0; + +_viv_uniform int offsetX; +_viv_uniform int offsetY; +_viv_uniform int offsetZ; +_viv_uniform int offsetW; +_viv_uniform int offset_idx; + +_viv_uniform float scaleInOut; +_viv_uniform float output_zp; +_viv_uniform int input_zp; +_viv_uniform float input_scale; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform int count_width; + +#define SCATTER_ND_UPDATE_QINT_PRE(src0_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_##src0_type##_pre( \ + __read_only image2d_t input1, __read_only image2d_t input2, \ + image2d_t output, image2d_t output_cnt, image2d_t tmp_output, \ + int width, int area, int vol, int coord_dim ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + Image img1 = create_image_from_image2d(input1, 4); \ + Image img2 = create_image_from_image2d(input2, element_size); \ + Image img3 = create_image_from_image2d(output, 4); \ + Image img4 = create_image_from_image2d(output_cnt, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \ + __global int* output_ptr = (__global int*)img3.ptr; \ + __global int* cnt_ptr = (__global int*)img4.ptr; \ + data_type src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src, tmpData, 4); \ + vxc_int4 data; \ + short zp = input_zp; \ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + atomic_add(output_ptr + loc, data.x); \ + if(gidx == 0) \ + { \ + atomic_inc(cnt_ptr + idx); \ + } \ +} +SCATTER_ND_UPDATE_QINT_PRE(U8, vxc_uchar8, uchar, 1) +SCATTER_ND_UPDATE_QINT_PRE(I8, vxc_char8, char, 1) +SCATTER_ND_UPDATE_QINT_PRE(I16, vxc_short8, short, 2) + +// input0 ref +// input1 sum +// input2 count +// input3 update +#define SCATTER_ND_UPDATE_QINT_TO_F16_BIG(src0_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_##src0_type##src0_type##toF16_big( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __read_only image2d_t input2, \ + __read_only image2d_t input3, \ + __read_only image2d_t input4, \ + image2d_t output, \ + int width, \ + int area, \ + int vol, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + Image img2 = create_image_from_image2d(input2, 4); \ + Image img3 = create_image_from_image2d(output, 2); \ + \ + __global int* cnt_ptr = (__global int*)img2.ptr; \ + __global short* output_ptr = (__global short*)img3.ptr; \ + data_type src; \ + \ + int cnt = cnt_ptr[gidy]; \ + int loc = gidy * output_width + gidx; \ + \ + vxc_ushort8 ms0; \ + vxc_half8 
tmpDst; \ + vxc_short8 dst; \ + if(cnt == 0) \ + { \ + Image img0 = create_image_from_image2d(input0, element_size); \ + __global ptr_type* ref_ptr = (__global ptr_type*)img0.ptr; \ + ptr_type tmpData = ref_ptr[loc]; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + _viv_asm(COPY, src, tmpData, 4); \ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst, tmpDst, 16); \ + output_ptr[loc] = dst.x; \ + } \ + else \ + { \ + Image img1 = create_image_from_image2d(input1, 4); \ + __global int* sum_ptr = (__global int*)img1.ptr; \ + int sum = sum_ptr[loc]; \ + float result = sum * input_scale; \ + half tmpOut; \ + _viv_asm(CONV, tmpOut, result); \ + _viv_asm(COPY, dst, tmpOut, 4); \ + output_ptr[loc] = dst.x; \ + } \ +} +SCATTER_ND_UPDATE_QINT_TO_F16_BIG(U8, vxc_uchar8, uchar, 1) +SCATTER_ND_UPDATE_QINT_TO_F16_BIG(I8, vxc_char8, char, 1) +SCATTER_ND_UPDATE_QINT_TO_F16_BIG(I16, vxc_short8, short, 2) + +#define SCATTER_ND_UPDATE_QINT_BIG(src0_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_##src0_type##src0_type##to##src0_type##_big( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __read_only image2d_t input2, \ + __read_only image2d_t input3, \ + __read_only image2d_t input4, \ + image2d_t output, \ + int width, \ + int area, \ + int vol, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img2 = create_image_from_image2d(input2, 4); \ + Image img3 = create_image_from_image2d(output, element_size); \ + __global int* cnt_ptr = (__global int*)img2.ptr; \ + __global ptr_type* output_ptr = (__global ptr_type*)img3.ptr; \ + int cnt = cnt_ptr[gidy]; \ + int loc = gidy * output_width + gidx; \ + data_type src, dst; \ + if(cnt == 0) \ + { \ + Image img0 = create_image_from_image2d(input0, element_size); \ + __global ptr_type* ref_ptr = (__global ptr_type*)img0.ptr; \ + ptr_type tmpData = ref_ptr[loc]; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + _viv_asm(COPY, src, tmpData, 4); \ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + output_ptr[loc] = dst.x; \ + } \ + else \ + { \ + Image img1 = create_image_from_image2d(input1, 4); \ + __global int* sum_ptr = (__global int*)img1.ptr; \ + int sum = sum_ptr[loc]; \ + int4 result; \ + result.x = convert_int_rte(sum * scaleInOut + output_zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + output_ptr[loc] = dst.x; \ + } \ +} +SCATTER_ND_UPDATE_QINT_BIG(U8, vxc_uchar8, uchar, 1) +SCATTER_ND_UPDATE_QINT_BIG(I8, vxc_char8, char, 1) +SCATTER_ND_UPDATE_QINT_BIG(I16, vxc_short8, short, 2) + +__kernel void scatter_nd_update_reset( + __read_only image2d_t input0, + image2d_t output_sum, + image2d_t output_cnt + ) +{ + int gidx = get_global_id(0); + + Image img3 = create_image_from_image2d(output_sum, 4); + Image img4 = create_image_from_image2d(output_cnt, 4); + __global int* sum_ptr = (__global int*)img3.ptr; + __global int* cnt_ptr = (__global int*)img4.ptr; + int4 data = (int4)(0); + vstore4(data, gidx, sum_ptr); + if(gidx < count_width) + { + vstore4(data, gidx, cnt_ptr); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx new file mode 100644 index 0000000..122fddb --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx @@ -0,0 +1,64 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccumulateSum_2x8; +_viv_uniform int index_num; +_viv_uniform int update_width; +_viv_uniform int output_width; + +_viv_uniform int offsetX; +_viv_uniform int offsetY; +_viv_uniform int offsetZ; +_viv_uniform int offsetW; +_viv_uniform int offset_idx; + +__kernel void scatter_nd_update_F16F16toF16_big( + __read_only image2d_t input0, + __read_only image2d_t input1, + __read_only image2d_t input2, + image2d_t output, + int width, + int area, + int vol, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int cnt = 0; + + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_half8 sum; + _viv_asm(COPY, sum, tmpVal, 16); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 2); + Image img3 = create_image_from_image2d(output, 2); + + __global int* index_ptr = (__global int*)img1.ptr; + __global short* update_ptr = (__global short*)img2.ptr; + __global short* output_ptr = (__global short*)img3.ptr; + for(int i = 0; i < index_num; i++) + { + int4 indice = vload4(0, index_ptr + offset_idx); + index_ptr += coord_dim; + + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; + if(gidy == idx) + { + vxc_half8 src; + short tmpData = update_ptr[i * update_width + gidx]; + cnt++; + _viv_asm(COPY, src, tmpData, 4); + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); + } + } + short dst; + _viv_asm(COPY, dst, sum, 4); + int loc = gidy * output_width+ gidx; + if(cnt == 0) + { + Image img0 = create_image_from_image2d(input0, 2); + __global short* ref_ptr = (__global short*)img0.ptr; + dst = ref_ptr[loc]; + } + output_ptr[loc] = dst; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/signal_frame.vx b/src/tim/vx/internal/src/libnnext/ops/vx/signal_frame.vx new file mode 100644 index 0000000..eec7217 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/signal_frame.vx @@ -0,0 +1,46 @@ +#include "cl_viv_vx_ext.h" + +#define SIGNAL_FRAME_8BITS_SH_IMPL(type) \ +__kernel void signal_frame_##type##to##type \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int frame_step \ + ) \ +{ \ + int inner = get_global_id(0); \ + int length_k = get_global_id(1); \ + int frames_id = get_global_id(2); \ + \ + int4 coord = (int4)(inner, length_k, frames_id, frames_id); \ + int2 coord_in = (int2)(inner, frames_id * frame_step + length_k); \ + \ + vxc_uchar16 src; \ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +SIGNAL_FRAME_8BITS_SH_IMPL(U8) +SIGNAL_FRAME_8BITS_SH_IMPL(I8) + +#define SIGNAL_FRAME_16BITS_SH_IMPL(type) \ +__kernel void signal_frame_##type##to##type \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int frame_step \ + ) \ +{ \ + int inner = get_global_id(0); \ + int length_k = get_global_id(1); \ + int frames_id = get_global_id(2); \ + \ + int4 coord = (int4)(inner, length_k, frames_id, frames_id); \ + int2 coord_in = (int2)(inner, frames_id * frame_step + length_k); \ + \ + vxc_short8 src; \ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} 
+SIGNAL_FRAME_16BITS_SH_IMPL(I16) +SIGNAL_FRAME_16BITS_SH_IMPL(F16) +SIGNAL_FRAME_16BITS_SH_IMPL(BF16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tensorstackconcat.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tensorstackconcat.vx new file mode 100644 index 0000000..78ff35c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tensorstackconcat.vx @@ -0,0 +1,64 @@ +#include "cl_viv_vx_ext.h" + +__kernel void tensorstackconcat_16bits + ( + __read_only image2d_array_t input, + __read_only image2d_t index, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.w = 0; + coord.y = read_imagei(index, coord.ww).x; + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void tensorstackconcat_8bits + ( + __read_only image2d_array_t input, + __read_only image2d_t index, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int idx = coord.x; + vxc_char16 src0, src1; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y = read_imagei(index, coord.ww).x; + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void tensorstackconcat_16bits_2D + ( + __read_only image2d_array_t input, + __read_only image2d_t index, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y = read_imagei(index, coord.ww).x; + VXC_WriteImage(output, coord.xy, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void tensorstackconcat_8bits_2D + ( + __read_only image2d_array_t input, + __read_only image2d_t index, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int idx = coord.x; + vxc_char16 src0, src1; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y = read_imagei(index, coord.ww).x; + VXC_WriteImage(output, coord.xy, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx index efc9266..87abddb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale.vx @@ -36,7 +36,7 @@ _viv_uniform float tail; coord_out.x = coord.x; \ for (int x = 0; x < stride; ) \ { \ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, write_val, \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, write_val, \ VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); \ x++; \ coord_out.x ++; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx index d1935b1..93eebdd 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsamplescale_k2.vx @@ -30,9 +30,9 @@ _viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] 
output zp VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_lo_2x8); \ VXC_DP2x8(dst_val, src_val, multiplier, \ VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_hi_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ coord.y ++; \ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ } UPSAMPLE_SCALETO8B_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char16) @@ -66,14 +66,14 @@ UPSAMPLE_SCALETO8B_FUN(U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16) VXC_DP2x8(dst0_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_lo_2x8); \ VXC_DP2x8(dst1_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_hi_2x8); \ _viv_asm(COPY, write_val, dst0_val, 16); \ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ coord.y ++; \ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, write_val, dst1_val, 16); \ coord.xy = coord.xy + (int2)(8, -1); \ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ coord.y ++; \ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ } UPSAMPLE_SCALETO16B_FUN(F16, F16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) UPSAMPLE_SCALETO16B_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_box_with_nms_limit.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_box_with_nms_limit.vx deleted file mode 100644 index c351f66..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_box_with_nms_limit.vx +++ /dev/null @@ -1,8 +0,0 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcBox_with_nms_limit( - __read_only image2d_array_t input, - __write_only image2d_array_t output) -{ - -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_detection_postprocess.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_detection_postprocess.vx deleted file mode 100644 index 763daa0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_detection_postprocess.vx +++ /dev/null @@ -1,8 +0,0 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcDetection_postprocess( - __read_only image2d_array_t input, - __write_only image2d_array_t output) -{ - -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx index 7ebb20a..52f51e5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx @@ -1,9 +1,11 @@ #include 
"cl_viv_vx_ext.h" -__kernel void vxcExtra_ending_i16( +__kernel void extra_ending_I16 + ( __read_only image2d_array_t input0, __read_only image2d_array_t input, - __write_only image2d_array_t output) + __write_only image2d_array_t output + ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -14,10 +16,28 @@ __kernel void vxcExtra_ending_i16( VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } -__kernel void vxcExtra_ending_i8( +__kernel void extra_ending_F16 + ( __read_only image2d_array_t input0, __read_only image2d_array_t input, - __write_only image2d_array_t output) + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void extra_ending_I8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -28,10 +48,12 @@ __kernel void vxcExtra_ending_i8( VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } -__kernel void vxcExtra_ending_u8( +__kernel void extra_ending_U8 + ( __read_only image2d_array_t input0, __read_only image2d_array_t input, - __write_only image2d_array_t output) + __write_only image2d_array_t output + ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx index 86d1c60..7540ae6 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx @@ -26,11 +26,19 @@ inline Image create_image_from_image2d(image2d_t input, int stride_x) int8 desc; _viv_asm(COPY, desc, input, sizeof(desc)); +#if (USE_40BITS_VA==0) + uint address = as_uint(desc.s0); + int stride_y = desc.s1; +#else + ulong address = as_ulong(desc.s05); + int stride_y = desc.s6; +#endif + Image img = { - .ptr = (uchar*)desc.s0, + .ptr = (uchar*)address, .stride_x = stride_x, - .stride_y = desc.s1 + .stride_y = stride_y }; return img; @@ -44,22 +52,35 @@ typedef struct Tensor int stride_z; } Tensor; -inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord) +inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord) { return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z; } inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x) { +#if (USE_40BITS_VA==0) int8 desc; _viv_asm(COPY, desc, input, sizeof(desc)); + uint address = as_uint(desc.s0); + int stride_y = desc.s1; + int stride_z = desc.s4; +#else + int16 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + ulong address = as_ulong(desc.s05); + int stride_y = desc.s6; + int stride_z = desc.sa; +#endif + Tensor t = { - .ptr = (uchar*)desc.s0, + .ptr = (uchar*)address, .stride_x = stride_x, - .stride_y = desc.s1, - .stride_z = desc.s4 + .stride_y = stride_y, + .stride_z = stride_z }; return t; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_heatmap_max_keypoint.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_heatmap_max_keypoint.vx deleted file mode 100644 index 
4d7a7a7..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_heatmap_max_keypoint.vx +++ /dev/null @@ -1,8 +0,0 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcHeatmap_max_keypoint( - __read_only image2d_array_t input, - __write_only image2d_array_t output) -{ - -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess.vx deleted file mode 100644 index 93ad2cd..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess.vx +++ /dev/null @@ -1,321 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniVecShift10; -_viv_uniform VXC_512Bits uniAddRShift; -_viv_uniform VXC_512Bits uniGetTempVal; -_viv_uniform VXC_512Bits uniExtractBytes; -_viv_uniform VXC_512Bits uniUnpackToR; -_viv_uniform VXC_512Bits uniUnpackToG; -_viv_uniform VXC_512Bits uniUnpackToB; -_viv_uniform VXC_512Bits uniDataMulAlpha_4x4; -_viv_uniform VXC_512Bits uniDataSubMean_4x4; - -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; -_viv_uniform float outputScale; -_viv_uniform VXC_512Bits uniExtactInteger_2x8; - -#define DESCALE(x) (((x) + (1<<19)) >> 20) -__kernel void ScaletoTensor_Int8 - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float rMean, - float gMean, - float bMean, - float f32Var - ) -{ - int2 ratioXY = (int2)(*xRatio, *yRatio); - - int4 xPos = get_global_id(0); - int yPos = get_global_id(1); - - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); - xPos += (int4)(0, 1, 2, 3); - - //x - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; - int4 sx = fx0 & 0xffff8000; - fx0 -= sx; - sx = sx >> 15; - - vxc_short4 fx; - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); - //y - int fy = yPos * ratioXY.y + ratioSufXY.y; - int sy = fy & 0xffff8000; // Floor - - fy -= sy; - sy = sy >> 15; - - fy = (fy + (1<< 4)) >> 5; - - //R - vxc_uchar16 line0RGB1, line0RGB2; - vxc_uchar16 line1RGB3, line1RGB4; - int4 coord; - sx = sx * 3 + *xOffset; - coord.xyz = sx.xyz; - coord.w = sy + *yOffset; - int2 coord1 = (int2)(sx.w, coord.w); - VXC_ReadImage(line0RGB1, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB1, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB2, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB2, input, coord1, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1),\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); - - bgrMean *= f32Var; - - int4 test01, temp1; - int4 test02, temp2; - int4 tt; - vxc_uchar4 val; - int4 coord_out = (int4)(xPos.x, yPos, 2, 0); - - vxc_uchar8 line1, line2; - - //R - VXC_DP2x8(line1, line0RGB1, line0RGB2,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); - VXC_DP2x8(line2, line1RGB3, line1RGB4,\ - 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); - - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - vxc_float4 tmp_dst; - vxc_uchar4 u8_dst; - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - //convert U8 to dfp8 - int4 dst0; - vxc_char4 dst; - tmp_dst = tmp_dst * f32Var - bgrMean.z; - tmp_dst *= outputScale; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - //G - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); - - coord_out.z = 1; - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - tmp_dst = tmp_dst * f32Var - bgrMean.y; - tmp_dst *= outputScale; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - //B - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); - - coord_out.z = 0; - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(u8_dst, temp2, 1 << 19,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - tmp_dst = tmp_dst * f32Var - bgrMean.x; - tmp_dst *= outputScale; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void ScaletoTensor_Fp16 - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float rMean, - float gMean, - float bMean, - float f32Var - ) -{ - int2 ratioXY = (int2)(*xRatio, *yRatio); - - int4 xPos = get_global_id(0); - int yPos = get_global_id(1); - - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); - xPos += (int4)(0, 1, 2, 3); - - //x - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; - int4 sx = fx0 & 0xffff8000; - fx0 -= sx; - sx = sx >> 15; - - vxc_short4 fx; - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); - //y - int fy = yPos * ratioXY.y + ratioSufXY.y; - int sy = fy & 0xffff8000; // Floor - - fy -= sy; - sy = sy >> 15; - - fy = (fy + (1<< 4)) >> 5; - - //R - vxc_uchar16 line0RGB1, line0RGB2; - vxc_uchar16 line1RGB3, line1RGB4; - int4 coord; - sx = sx * 3 + *xOffset; - coord.xyz = sx.xyz; - coord.w = sy + *yOffset; - int2 coord1 = (int2)(sx.w, coord.w); - VXC_ReadImage(line0RGB1, input, coord.xw,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB1, input, coord.yw,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB2, input, coord.zw,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB2, input, coord1,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(line1RGB3, input, coord.xw,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB3, input, coord.yw,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB4, input, coord.zw,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB4, input, coord1,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); - - int4 test01, temp1; - int4 test02, temp2; - int4 tt; - vxc_uchar4 val; - int4 coord_out = (int4)(xPos.x, yPos, 2, 0); - - vxc_uchar8 line1, line2; - - //R - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); - - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - - //convert U8 to FP16 - half4 f16mean; - half f16alpha; - vxc_half4 dst; - vxc_short4 tmp_dst; - _viv_asm(CONV, f16mean, bgrMean); - _viv_asm(CONV, f16alpha, f32Var); - VXC_DP4x4(dst, val, f16mean.z, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4); - VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4); - _viv_asm(COPY, tmp_dst, dst, 8); - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - //G - VXC_DP2x8(line1, 
line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); - - coord_out.z = 1; - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - - VXC_DP4x4(dst, val, f16mean.y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4); - VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4); - _viv_asm(COPY, tmp_dst, dst, 8); - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - //B - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); - - coord_out.z = 0; - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - - VXC_DP4x4(dst, val, f16mean.x, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4); - VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4); - _viv_asm(COPY, tmp_dst, dst, 8); - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - -} - diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_2.vx deleted file mode 100644 index 88efed8..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_2.vx +++ /dev/null @@ -1,327 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniVecShift10; -_viv_uniform VXC_512Bits uniAddRShift; -_viv_uniform VXC_512Bits uniGetTempVal; -_viv_uniform VXC_512Bits uniExtractBytes; -_viv_uniform VXC_512Bits uniUnpackToR; -_viv_uniform VXC_512Bits uniUnpackToG; -_viv_uniform VXC_512Bits uniUnpackToB; -_viv_uniform VXC_512Bits uniDataMulAlpha_4x4; -_viv_uniform VXC_512Bits uniDataSubMean_4x4; - -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; -_viv_uniform float outputScale; -_viv_uniform VXC_512Bits uniExtactInteger_2x8; - -#define DESCALE(x) (((x) + (1<<19)) >> 20) -__kernel void ScaletoTensor_Int16 - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float rMean, - float gMean, - float bMean, - float f32Var - ) -{ - int2 ratioXY = (int2)(*xRatio, *yRatio); - - int4 xPos = get_global_id(0); - int yPos = get_global_id(1); - - int2 ratioSufXY 
= (ratioXY >> 1) - (1 << 14); - xPos += (int4)(0, 1, 2, 3); - - //x - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; - int4 sx = fx0 & 0xffff8000; - fx0 -= sx; - sx = sx >> 15; - - vxc_short4 fx; - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); - //y - int fy = yPos * ratioXY.y + ratioSufXY.y; - int sy = fy & 0xffff8000; // Floor - - fy -= sy; - sy = sy >> 15; - - fy = (fy + (1<< 4)) >> 5; - - //R - vxc_uchar16 line0RGB1, line0RGB2; - vxc_uchar16 line1RGB3, line1RGB4; - int4 coord; - sx = sx * 3 + *xOffset; - coord.xyz = sx.xyz; - coord.w = sy + *yOffset; - int2 coord1 = (int2)(sx.w, coord.w); - VXC_ReadImage(line0RGB1, input, coord.xw,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB1, input, coord.yw,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB2, input, coord.zw,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB2, input, coord1,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(line1RGB3, input, coord.xw,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB3, input, coord.yw,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB4, input, coord.zw,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB4, input, coord1,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); - - bgrMean *= f32Var; - - int4 test01, temp1; - int4 test02, temp2; - int4 tt; - vxc_uchar4 val; - int4 coord_out = (int4)(xPos.x, yPos, 2, 0); - - vxc_uchar8 line1, line2; - - //R - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); - - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - vxc_float4 tmp_dst; - vxc_uchar4 u8_dst; - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - //convert U8 to dfp8 - int4 dst0; - vxc_short4 dst; - tmp_dst = tmp_dst * f32Var - bgrMean.z; - tmp_dst *= outputScale; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - //G - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); - - coord_out.z = 1; - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + 
test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - tmp_dst = tmp_dst * f32Var - bgrMean.y; - tmp_dst *= outputScale; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - //B - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); - - coord_out.z = 0; - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - tmp_dst = tmp_dst * f32Var - bgrMean.x; - tmp_dst *= outputScale; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -_viv_uniform float outputZP; -__kernel void ScaletoTensor_UInt8 - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float rMean, - float gMean, - float bMean, - float f32Var - ) -{ - int2 ratioXY = (int2)(*xRatio, *yRatio); - - int4 xPos = get_global_id(0); - int yPos = get_global_id(1); - - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); - xPos += (int4)(0, 1, 2, 3); - - //x - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; - int4 sx = fx0 & 0xffff8000; - fx0 -= sx; - sx = sx >> 15; - - vxc_short4 fx; - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); - //y - int fy = yPos * ratioXY.y + ratioSufXY.y; - int sy = fy & 0xffff8000; // Floor - - fy -= sy; - sy = sy >> 15; - - fy = (fy + (1<< 4)) >> 5; - - //R - vxc_uchar16 line0RGB1, line0RGB2; - vxc_uchar16 line1RGB3, line1RGB4; - int4 coord; - sx = sx * 3 + *xOffset; - coord.xyz = sx.xyz; - coord.w = sy + *yOffset; - int2 coord1 = (int2)(sx.w, coord.w); - VXC_ReadImage(line0RGB1, input, coord.xw,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB1, input, coord.yw,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB2, input, coord.zw,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0RGB2, input, coord1,\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - - 
VXC_ReadImage(line1RGB3, input, coord.xw,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB3, input, coord.yw,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB4, input, coord.zw,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1RGB4, input, coord1,\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); - - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); - - bgrMean *= f32Var; - - int4 test01, temp1; - int4 test02, temp2; - int4 tt; - vxc_uchar4 val; - int4 coord_out = (int4)(xPos.x, yPos, 2, 0); - - vxc_uchar8 line1, line2; - - //R - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); - - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - vxc_float4 tmp_dst; - vxc_uchar4 u8_dst; - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - //convert U8 to dfp8 - int4 dst0; - vxc_uchar4 dst; - tmp_dst = tmp_dst * f32Var - bgrMean.z; - tmp_dst = tmp_dst * outputScale + outputZP; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - //G - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); - - coord_out.z = 1; - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - tmp_dst = tmp_dst * f32Var - bgrMean.y; - tmp_dst = tmp_dst * outputScale + outputZP; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - //B - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); - - coord_out.z = 0; - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - tmp_dst = tmp_dst * f32Var - bgrMean.x; - tmp_dst = tmp_dst * outputScale + outputZP; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_3.vx deleted file mode 100644 index 742459e..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_3.vx +++ /dev/null @@ -1,214 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniExtractR_2x8; -_viv_uniform VXC_512Bits uniExtractG_2x8; -_viv_uniform VXC_512Bits uniExtractB_2x8; -_viv_uniform float outputScale; -__kernel void ScaletoTensor_Fp16_copy - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float rMean, - float gMean, - float bMean, - float f32Var - ) -{ - int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); - - coord.xy += (int2) (*xOffset, *yOffset); - vxc_uchar16 src0, src1; - vxc_half8 dst; - - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var); - //convert U8 to FP16 - half4 paramData_f16; - vxc_short8 tmp_dst; - _viv_asm(CONV, paramData_f16, paramData); - - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0); - //R - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractR_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractR_2x8); - _viv_asm(COPY, tmp_dst, dst, 16); - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - - //G - coord_out.z = 1; - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractG_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractG_2x8); - _viv_asm(COPY, tmp_dst, dst, 16); - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - //B - coord_out.z = 0; - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractB_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractB_2x8); - _viv_asm(COPY, tmp_dst, dst, 16); - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void ScaletoTensor_Int8_copy - ( - __read_only image2d_t input, - __write_only 
image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float rMean, - float gMean, - float bMean, - float f32Var - ) -{ - int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); - - coord.xy += (int2) (*xOffset, *yOffset); - vxc_uchar16 src0, src1; - vxc_char16 dst; - - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - f32Var *= outputScale; - float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var); - //convert U8 to FP16 - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0); - //R - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); - - - //G - coord_out.z = 1; - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); - - //B - coord_out.z = 0; - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void ScaletoTensor_Int16_copy - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float rMean, - float gMean, - float bMean, - float f32Var - ) -{ - int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); - - coord.xy += (int2) (*xOffset, *yOffset); - vxc_uchar16 src0, src1; - vxc_short8 dst; - - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - f32Var *= outputScale; - float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var); - //convert U8 to FP16 - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0); - //R - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - - //G - coord_out.z = 1; - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - //B - coord_out.z = 0; - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); - VXC_DP2x8(dst, src1, paramData_f16, 
VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -_viv_uniform float outputZP; -__kernel void ScaletoTensor_UInt8_copy - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float rMean, - float gMean, - float bMean, - float f32Var - ) -{ - int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); - - coord.xy += (int2) (*xOffset, *yOffset); - vxc_uchar16 src0, src1; - vxc_uchar16 dst; - - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - f32Var *= outputScale; - float4 paramData = (float4)(rMean * f32Var - outputZP,\ - gMean * f32Var - outputZP, bMean * f32Var - outputZP, f32Var); - //convert U8 to FP16 - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0); - //R - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); - - - //G - coord_out.z = 1; - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); - - //B - coord_out.z = 0; - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_4.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_4.vx deleted file mode 100644 index e7c9049..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_4.vx +++ /dev/null @@ -1,207 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniVecShift10; -_viv_uniform VXC_512Bits uniAddRShift; -_viv_uniform VXC_512Bits uniGetTempVal; -_viv_uniform VXC_512Bits uniExtractBytes; - -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; -_viv_uniform float outputScale; -_viv_uniform VXC_512Bits uniExtactInteger_2x8; - -#define DESCALE(x) (((x) + (1<<19)) >> 20) -__kernel void GrayScaletoTensor_Int8 - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float mean, - float f32Var - ) -{ - int2 ratioXY = (int2)(*xRatio, *yRatio); - - int4 xPos = get_global_id(0); - int yPos = get_global_id(1); - - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); - xPos += (int4)(0, 1, 2, 3); - - //x - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; - int4 sx = fx0 & 0xffff8000; - fx0 -= sx; - sx = sx >> 15; - - vxc_short4 fx; - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); - //y - int fy = yPos * ratioXY.y + ratioSufXY.y; - int sy = fy & 0xffff8000; 
// Floor - - fy -= sy; - sy = sy >> 15; - - fy = (fy + (1<< 4)) >> 5; - - //R - vxc_uchar16 line0Y; - vxc_uchar16 line1Y; - int4 coord; - sx = sx + *xOffset; - coord.xyz = sx.xyz; - coord.w = sy + *yOffset; - int2 coord1 = (int2)(sx.w, coord.w); - VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - float grayMean = mean * f32Var; - - int4 test01, temp1; - int4 test02, temp2; - int4 tt; - vxc_uchar4 val; - int2 coord_out = (int2)(xPos.x, yPos); - - vxc_uchar8 line1, line2; - - VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - vxc_float4 tmp_dst; - vxc_uchar4 u8_dst; - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, - 1), uniConvertIntergetoF32_4x4); - - //convert U8 to dfp8 - int4 dst0; - vxc_char4 dst; - tmp_dst = tmp_dst * f32Var - grayMean; - tmp_dst *= outputScale; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -_viv_uniform VXC_512Bits uniDataMulAlpha_4x4; -_viv_uniform VXC_512Bits uniDataSubMean_4x4; -__kernel void GrayScaletoTensor_Fp16 - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float mean, - float f32Var - ) -{ - int2 ratioXY = (int2)(*xRatio, *yRatio); - - int4 xPos = get_global_id(0); - int yPos = get_global_id(1); - - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); - xPos += (int4)(0, 1, 2, 3); - - //x - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; - int4 sx = fx0 & 0xffff8000; - fx0 -= sx; - sx = sx >> 15; - - vxc_short4 fx; - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); - //y - int fy = yPos * ratioXY.y + ratioSufXY.y; - int sy = fy & 0xffff8000; // Floor - - fy -= sy; - sy = sy >> 15; - - fy = (fy + (1<< 4)) >> 5; - - //R - vxc_uchar16 line0Y; - vxc_uchar16 line1Y; - int4 coord; - sx = sx + *xOffset; - coord.xyz = sx.xyz; - coord.w = sy + *yOffset; - int2 coord1 = (int2)(sx.w, coord.w); - VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 
0), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - float grayMean = mean; - - int4 test01, temp1; - int4 test02, temp2; - int4 tt; - vxc_uchar4 val; - int2 coord_out = (int2)(xPos.x, yPos); - - VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - - //convert U8 to FP16 - half f16mean; - half f16alpha; - vxc_half4 dst; - vxc_short4 tmp_dst; - _viv_asm(CONV, f16mean, grayMean); - _viv_asm(CONV, f16alpha, f32Var); - VXC_DP4x4(dst, val, f16mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4); - VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4); - _viv_asm(COPY, tmp_dst, dst, 8); - VXC_WriteImage(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_5.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_5.vx deleted file mode 100644 index 15bfb2e..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_5.vx +++ /dev/null @@ -1,355 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniVecShift10; -_viv_uniform VXC_512Bits uniAddRShift; -_viv_uniform VXC_512Bits uniGetTempVal; -_viv_uniform VXC_512Bits uniExtractBytes; - -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; -_viv_uniform float outputScale; -_viv_uniform VXC_512Bits uniExtactInteger_2x8; - -_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; -_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; -__kernel void GrayScaletoTensor_Fp16_copy - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float mean, - float f32Var - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - coord.xy += (int2) (*xOffset, *yOffset); - vxc_uchar16 src0; - vxc_half8 dst0, dst1; - - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - coord.x = coord.z + 8; - float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var); - //convert U8 to FP16 - half4 paramData_f16; - vxc_short8 tmp_dst; - 
_viv_asm(CONV, paramData_f16, paramData); - - VXC_DP2x8(dst0, src0, paramData_f16, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataMeanStddevLo_2x8); - VXC_DP2x8(dst1, src0, paramData_f16, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataMeanStddevHi_2x8); - _viv_asm(COPY, tmp_dst, dst0, 16); - VXC_WriteImage(output, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, tmp_dst, dst1, 16); - VXC_WriteImage(output, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void GrayScaletoTensor_Int8_copy - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float mean, - float f32Var - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - coord.xy += (int2) (*xOffset, *yOffset); - vxc_uchar16 src0; - vxc_char16 dst; - - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - f32Var *= outputScale; - float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var); - //convert U8 to FP16 - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - VXC_DP2x8(dst, src0, paramData_f16, - VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8); - VXC_DP2x8(dst, src0, paramData_f16, - VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8); - VXC_WriteImage(output, coord.zw, dst, - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - -} - -__kernel void GrayScaletoTensor_Int16 - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float mean, - float f32Var - ) -{ - int2 ratioXY = (int2)(*xRatio, *yRatio); - - int4 xPos = get_global_id(0); - int yPos = get_global_id(1); - - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); - xPos += (int4)(0, 1, 2, 3); - - //x - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; - int4 sx = fx0 & 0xffff8000; - fx0 -= sx; - sx = sx >> 15; - - vxc_short4 fx; - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); - //y - int fy = yPos * ratioXY.y + ratioSufXY.y; - int sy = fy & 0xffff8000; // Floor - - fy -= sy; - sy = sy >> 15; - - fy = (fy + (1<< 4)) >> 5; - - vxc_uchar16 line0Y; - vxc_uchar16 line1Y; - int4 coord; - sx = sx + *xOffset; - coord.xyz = sx.xyz; - coord.w = sy + *yOffset; - int2 coord1 = (int2)(sx.w, coord.w); - VXC_ReadImage(line0Y, input, coord.xw, - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord.yw, - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord.zw, - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - float grayMean = mean * f32Var; - - int4 test01, temp1; - int4 test02, temp2; - 
int4 tt; - vxc_uchar4 val; - int2 coord_out = (int2)(xPos.x, yPos); - - vxc_uchar8 line1, line2; - - VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - vxc_float4 tmp_dst; - vxc_uchar4 u8_dst; - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - //convert U8 to dfp8 - int4 dst0; - vxc_short4 dst; - tmp_dst = tmp_dst * f32Var - grayMean; - tmp_dst *= outputScale; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void GrayScaletoTensor_Int16_copy - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float mean, - float f32Var - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - coord.xy += (int2) (*xOffset, *yOffset); - vxc_uchar16 src0; - vxc_short8 dst0, dst1; - - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - coord.x = coord.z + 8; - - f32Var *= outputScale; - float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var); - //convert U8 to FP16 - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - - VXC_DP2x8(dst0, src0, paramData_f16, - VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8); - VXC_DP2x8(dst1, src0, paramData_f16, - VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8); - VXC_WriteImage(output, coord.zw, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xw, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -_viv_uniform float outputZP; -__kernel void GrayScaletoTensor_UInt8_copy - ( - __read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float mean, - float f32Var - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - coord.xy += (int2) (*xOffset, *yOffset); - vxc_uchar16 src0; - vxc_uchar16 dst; - - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - f32Var *= outputScale; - float4 paramData = (float4)(mean * f32Var - outputZP, mean * f32Var - outputZP, - mean * f32Var - outputZP, f32Var); - //convert U8 to FP16 - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - VXC_DP2x8(dst, src0, paramData_f16, - VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8); - VXC_DP2x8(dst, src0, paramData_f16, - VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8); - VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void GrayScaletoTensor_UInt8 - ( - 
__read_only image2d_t input, - __write_only image2d_array_t output, - global int *xRatio, - global int *yRatio, - global int *xOffset, - global int *yOffset, - float mean, - float f32Var - ) -{ - int2 ratioXY = (int2)(*xRatio, *yRatio); - - int4 xPos = get_global_id(0); - int yPos = get_global_id(1); - - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); - xPos += (int4)(0, 1, 2, 3); - - //x - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; - int4 sx = fx0 & 0xffff8000; - fx0 -= sx; - sx = sx >> 15; - - vxc_short4 fx; - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); - //y - int fy = yPos * ratioXY.y + ratioSufXY.y; - int sy = fy & 0xffff8000; // Floor - - fy -= sy; - sy = sy >> 15; - - fy = (fy + (1<< 4)) >> 5; - - //R - vxc_uchar16 line0Y; - vxc_uchar16 line1Y; - int4 coord; - sx = sx + *xOffset; - coord.xyz = sx.xyz; - coord.w = sy + *yOffset; - int2 coord1 = (int2)(sx.w, coord.w); - VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - float grayMean = mean * f32Var; - - int4 test01, temp1; - int4 test02, temp2; - int4 tt; - vxc_uchar4 val; - int2 coord_out = (int2)(xPos.x, yPos); - - VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp1 = temp1 + test01; - - VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); - VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); - temp2 = temp2 + test02; - temp2 = fy * (temp2 - temp1) + (temp1 << 10); - - vxc_float4 tmp_dst; - vxc_uchar4 u8_dst; - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); - - //convert U8 to dfp8 - int4 dst0; - vxc_uchar4 dst; - tmp_dst = tmp_dst * f32Var - grayMean; - tmp_dst = tmp_dst * outputScale + outputZP; - dst0 = convert_int4_rte(tmp_dst); - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); - - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_roi_align.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_roi_align.vx deleted file mode 100644 index 90804a3..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_roi_align.vx +++ /dev/null @@ -1,8 +0,0 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcRoi_align( - __read_only image2d_array_t input, - __write_only image2d_array_t output) -{ - -} diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_signalframe.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_signalframe.vx deleted file mode 100644 index f055ad7..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_signalframe.vx +++ /dev/null @@ -1,278 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int input_width; -_viv_uniform int input_height; -_viv_uniform int input_channel; -_viv_uniform int output_channel; - - -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_width( - image2d_array_t input, - image2d_array_t output, - int frame_length, - int step, - int pad_end, - int pad, - int axis) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int outChn = gidz * input_height + gidy; - int4 coord = (int4)(0, gidy, gidz, 0); - int4 coord_out = (int4)(0, 0, outChn, 0); - - int endcoord = (pad_end == 0) ? (input_width - frame_length + 1) : (input_width); - int iter = frame_length / 8; - int res = frame_length % 8; - vxc_short8 src0; - - for(int i = 0; i < endcoord; i += step) - { - coord.x = i; - for(int j = 0; j < iter; j++) - { - coord_out.x = j << 3; - coord.x = i + (j << 3); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } - coord.x = i + (iter << 3); - coord_out.x = (iter << 3); - for(int j = 0; j < res; j++) - { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_out.x++; - coord.x++; - } - - coord_out.y++; - } -} - -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_height( - image2d_array_t input, - image2d_array_t output, - int frame_length, - int step, - int pad_end, - int pad, - int axis) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int outChn = gidz * output_channel + (gidy / step); - int4 coord = (int4)(gidx, gidy, gidz, 0); - int4 coord_out = (int4)(gidx, 0, outChn, 0); - vxc_short8 src0; - - for(int i = 0; i < frame_length; i++) - { - coord.y = gidy + i; - coord_out.y = i; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_channel( - image2d_array_t input, - image2d_array_t output, - int frame_length, - int step, - int pad_end, - int pad, - int axis) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int outChn = (gidz / step) * frame_length; - int4 coord = (int4)(gidx, gidy, gidz, 0); - int4 coord_out = (int4)(gidx, gidy, outChn, 0); - vxc_short8 src0; - - for(int i = 0; i < frame_length; i++) - { - coord.z = gidz + i; - coord_out.z = outChn + i; - if(coord.z < input_channel) - { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } - else - { - src0 = (vxc_short8)(0); - } - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_width_8bit( - image2d_array_t input, - image2d_array_t 
output, - int frame_length, - int step, - int pad_end, - int pad, - int axis) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int outChn = gidz * input_height + gidy; - int4 coord = (int4)(0, gidy, gidz, 0); - int4 coord_out = (int4)(0, 0, outChn, 0); - - int endcoord = (pad_end == 0) ? (input_width - frame_length + 1) : (input_width); - int iter = frame_length / 8; - int res = frame_length % 8; - vxc_char8 src0; - - for(int i = 0; i < endcoord; i += step) - { - coord.x = i; - for(int j = 0; j < iter; j++) - { - coord_out.x = j << 3; - coord.x = i + (j << 3); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } - coord.x = i + (iter << 3); - coord_out.x = (iter << 3); - for(int j = 0; j < res; j++) - { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_out.x++; - coord.x++; - } - - coord_out.y++; - } -} - -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_height_8bit( - image2d_array_t input, - image2d_array_t output, - int frame_length, - int step, - int pad_end, - int pad, - int axis) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int outChn = gidz * output_channel + (gidy / step); - int4 coord = (int4)(gidx, gidy, gidz, 0); - int4 coord_out = (int4)(gidx, 0, outChn, 0); - vxc_char8 src0; - - for(int i = 0; i < frame_length; i++) - { - coord.y = gidy + i; - coord_out.y = i; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_channel_8bit( - image2d_array_t input, - image2d_array_t output, - int frame_length, - int step, - int pad_end, - int pad, - int axis) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int outChn = (gidz / step) * frame_length; - int4 coord = (int4)(gidx, gidy, gidz, 0); - int4 coord_out = (int4)(gidx, gidy, outChn, 0); - vxc_char8 src0; - - for(int i = 0; i < frame_length; i++) - { - coord.z = gidz + i; - coord_out.z = outChn + i; - if(coord.z < input_channel) - { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } - else - { - src0 = (vxc_char8)(0); - } - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -#if 0 -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void vxcSignalFrame_tensor( - image2d_array_t input, - image2d_array_t output, - image2d_array_t frame_length, - image2d_array_t steps, - image2d_array_t pad_end, - image2d_array_t pad, - image2d_array_t axis) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int outChn = gidz * input_height + gidy; - int4 coord = (int4)(0, gidy, gidz, 0); - int4 coord_out = (int4)(0, 0, outChn, 0); - int4 coord_para = (int4)(0, 0, 0, 0); - - int4 size = read_imagei(frame_length, coord_para); - int4 step = read_imagei(steps, coord_para); - int4 pe = read_imagei(pad_end, coord_para); - int4 pd = read_imagei(pad, 
coord_para); - int len = input_width + (pe.x ? pd : 0); - int endcoord = len - size.x + 1; - int iter = size.x / 8; - int res = size.x % 8; - vxc_short8 src0; - - for(int i = 0; i < endcoord; i += step.x) - { - coord.x = i; - for(int j = 0; j < iter; j++) - { - coord_out.x = j << 3; - coord.x += (j << 3); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } - coord.x = i + (iter << 3); - coord_out.x = (iter << 3); - for(int j = 0; j < res; j++) - { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord_out.x++; - coord.x++; - } - - coord_out.y++; - } -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_tensorstackconcat.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_tensorstackconcat.vx deleted file mode 100644 index 6d1eb8f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_tensorstackconcat.vx +++ /dev/null @@ -1,42 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/*******************tensorstackconcat 16BITs********************/ -__kernel void vxcTensorStackConcat( - image2d_array_t input, - image2d_t index, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - vxc_short8 src0, src1; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.w = 0; - coord.y = read_imagei(index, coord.ww).x; - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -/**************tensorstackconcat 8BITs***************************/ -__kernel void vxcTensorStackConcat8Bits( - image2d_array_t input, - image2d_t index, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int idx = coord.x; - vxc_char16 src0, src1; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.x += 16; - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.x = idx; - coord.w = 0; - coord.y = read_imagei(index, coord.ww).x; - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.x += 16; - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_gemm.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_gemm.vx deleted file mode 100644 index 7a146d1..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_gemm.vx +++ /dev/null @@ -1,39 +0,0 @@ -/* - ============================================================================ - Name : gemm.vx - Author : Sam - Version : - Copyright : Your copyright notice - Description : - ============================================================================ - */ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniGemm3x3_4x4; 
-__kernel void vxcTransform_Gemm_F16toF16 - ( - __read_only image2d_array_t thetaTensor, - __read_only image2d_array_t gridTensor, - __write_only image2d_array_t coordinates - ) -{ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); - - vxc_short8 vec0, vec1, vec2; - vxc_half8 src0, src1, src2, dst; - - VXC_ReadImage(vec0,thetaTensor,coord.xx,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,5, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, src0, vec0, 16); - VXC_ReadImage(vec1,gridTensor,coord.yz,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,5,0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, src1, vec1, 16); - VXC_ReadImage(vec2,gridTensor,coord.yz,VXC_5BITOFFSET_XY(6,0),VXC_MODIFIER(0,5,0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, src2, vec2, 16); - - coord.y = (int)((short)coord.y / (short)3) * 2; - - VXC_DP4x4(dst, src1, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemm3x3_4x4); - VXC_DP4x4(dst, src2, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniGemm3x3_4x4); - - _viv_asm(COPY, vec0, dst, 16); - VXC_WriteImage(coordinates, coord.yz, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_interp.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_interp.vx deleted file mode 100644 index c149e6f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_interp.vx +++ /dev/null @@ -1,125 +0,0 @@ -/* - ============================================================================ - Name : minimum.vx - Author : Sam - Version : - Copyright : Your copyright notice - Description : - ============================================================================ - */ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniGetDXY_4x4; -_viv_uniform VXC_512Bits uniConvertF16toF32_4x4; -_viv_uniform int2 packedWH2; -_viv_uniform int packedWH; -__kernel void vxcTransform_InterP_F16toF16_2D - ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input1, - __write_only image2d_array_t output - ) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 vec0; - vxc_half8 pxy; - vxc_float4 dxy4; - vxc_int4 pos4; - short dst = 0; - - VXC_ReadImage(vec0, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, pxy, vec0, 4); - - coord.x >>= 1; - vxc_short2 packedWH_16B; - _viv_asm(COPY, packedWH_16B, packedWH, 4); - VXC_DP4x4(dxy4, pxy, packedWH_16B, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniGetDXY_4x4); - dxy4.zw = floor(dxy4.xy); - pos4.xy = convert_int2(dxy4.zw); - pos4.zw = convert_int2(ceil(dxy4.xy)); - - vxc_short8 vec1; - vxc_half8 src0, src1; - VXC_ReadImage(vec0, input0, pos4.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, src0, vec0, 8); - VXC_ReadImage(vec1, input0, pos4.xw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, src1, vec1, 8); - - float2 xyLerp = dxy4.xy - dxy4.zw; - float2 oneSub_xyLerp = 1.0f - xyLerp; - float4 coef = (float4)(oneSub_xyLerp.x * oneSub_xyLerp.y, xyLerp.x * oneSub_xyLerp.y, - oneSub_xyLerp.x * xyLerp.y, xyLerp.x * xyLerp.y); - float4 data; - - VXC_DP4x4(data, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16toF32_4x4); - - data.x = dot(data, coef); - - half tmp; - _viv_asm(CONV, tmp, data); - _viv_asm(COPY, dst, tmp, 4); - - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); -} - -_viv_uniform int depth; -__kernel void 
vxcTransform_InterP_F16toF16 - ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input1, - __write_only image2d_array_t output - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); - - vxc_short8 vec0; - vxc_half8 pxy; - vxc_float4 dxy4; - vxc_int4 pos4; - short dst = 0; - - VXC_ReadImage(vec0, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, pxy, vec0, 4); - - coord.x >>= 1; - vxc_short2 packedWH_16B; - _viv_asm(COPY, packedWH_16B, packedWH, 4); - VXC_DP4x4(dxy4, pxy, packedWH_16B, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniGetDXY_4x4); - dxy4.zw = floor(dxy4.xy); - pos4.xy = convert_int2(dxy4.zw); - pos4.zw = convert_int2(ceil(dxy4.xy)); - - - float2 xyLerp = dxy4.xy - dxy4.zw; - float2 oneSub_xyLerp = 1.0f - xyLerp; - float4 coef = (float4)(oneSub_xyLerp.x * oneSub_xyLerp.y, xyLerp.x * oneSub_xyLerp.y, - oneSub_xyLerp.x * xyLerp.y, xyLerp.x * xyLerp.y); - - int4 coord_ = (int4)(pos4.x, pos4.y, 0, 0); - do - { - vxc_short8 vec1; - vxc_half8 src0, src1; - VXC_ReadImage2DArray(vec0,input0,coord_,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,1,0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, src0, vec0, 8); - VXC_ReadImage2DArray(vec1,input0,coord_,VXC_5BITOFFSET_XY(0,1),VXC_MODIFIER(0,1,0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, src1, vec1, 8); - - coord_.z ++; - float4 data; - VXC_DP4x4(data, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16toF32_4x4); - - data.x = dot(data, coef); - - half tmp; - _viv_asm(CONV, tmp, data); - _viv_asm(COPY, dst, tmp, 4); - - - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); - coord.z ++; - - } while (coord.z < depth); -} - diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_setupThres.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_setupThres.vx deleted file mode 100644 index 31b1cec..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_setupThres.vx +++ /dev/null @@ -1,32 +0,0 @@ -/* - ============================================================================ - Name : gemm.vx - Author : Sam - Version : - Copyright : Your copyright notice - Description : - ============================================================================ - */ -#include "cl_viv_vx_ext.h" - -_viv_uniform int4 extract_packed; -__kernel void vxcTransform_setupThres_F16toF16 - ( - __read_only image2d_array_t initTensor, - __read_only image2d_array_t inputFC, - global int* thresFlag, - __write_only image2d_array_t thres - ) -{ - int2 coord = (int2)(0, 0); - - vxc_ushort8 src0, src1, dst; - - int flag = *thresFlag; - VXC_ReadImage(src0, initTensor, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, inputFC, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_BitExtract(dst, src0, src1, extract_packed, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_WriteImage(thres, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/warp_affine.vx b/src/tim/vx/internal/src/libnnext/ops/vx/warp_affine.vx new file mode 100644 index 0000000..10b1a29 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/warp_affine.vx @@ -0,0 +1,89 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertDatatoF32_0_4x4; +_viv_uniform VXC_512Bits uniConvertDatatoF32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float 
input_scale; +_viv_uniform float input_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; +#define WARP_AFFINE_SH_IMPL(name0, name1, src_type, src_copy_type, convert_type, dst_type, dst_copy_type) \ +__kernel void warp_affine_##name0##to##name1 \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_t matrix, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); \ + \ + float4 coord_f = convert_float4(coord_in); \ + \ + int2 m_coord = (int2)(0, 0); \ + vxc_ushort8 m0, m1; \ + VXC_ReadImage(m0, matrix, m_coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(m1, matrix, m_coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 matrix0, matrix1; \ + _viv_asm(COPY, matrix0, m0, 16); \ + _viv_asm(COPY, matrix1, m1, 16); \ + \ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; \ + \ + coord_in = convert_int4(coord_f < 0 ? coord_f - 2 : coord_f); \ + \ + int4 coord_in0 = (int4)(coord_in.xy, coord.zw); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in0.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in0.z, baseAddr); \ + \ + src_type v0, v1; \ + src_copy_type top, bot; \ + VXC_OP4(img_load_3d, v0, input, coord_in0, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, v1, input, coord_in0, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in0.xy = coord_in.zw; \ + VXC_OP4(img_load_3d, v0, input, coord_in0, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, v1, input, coord_in0, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, v0, 16); \ + _viv_asm(COPY, bot, v1, 16); \ + \ + float4 lerp = coord_f - floor(coord_f); \ + float4 minus_lerp = 1.0f - lerp; \ + float4 coef0 = (float4)( minus_lerp.x * minus_lerp.y, lerp.x * minus_lerp.y, \ + minus_lerp.x * lerp.y, lerp.x * lerp.y); \ + \ + float4 coef1 = (float4)( minus_lerp.z * minus_lerp.w, lerp.z * minus_lerp.w, \ + minus_lerp.z * lerp.w, lerp.z * lerp.w); \ + \ + float4 data0, data1, result = 0; \ + VXC_DP4x4(data0, top, bot, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDatatoF32_0_4x4); \ + VXC_DP4x4(data1, top, bot, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDatatoF32_1_4x4); \ + \ + data0 = data0 * input_scale + input_tail; \ + data1 = data1 * input_scale + input_tail; \ + result.x = dot(data0, coef0); \ + result.y = dot(data1, coef1); \ + result.xy = result.xy * output_scale + output_zp; \ + convert_type dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 1, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + dst_copy_type dst; \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 1, 0,VXC_RM_TowardZero, 0)); \ +} +WARP_AFFINE_SH_IMPL(F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +WARP_AFFINE_SH_IMPL(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +WARP_AFFINE_SH_IMPL(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +WARP_AFFINE_SH_IMPL(F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) 
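The WARP_AFFINE_SH_IMPL body above maps each output pixel through the 2x3 matrix read from the matrix image and then blends the four neighbouring input pixels with weights built from the fractional parts (coef0/coef1). A rough plain-C sketch of that bilinear step follows; bilinear_sample and AT() are hypothetical helpers, and border handling is omitted:

#include <math.h>
#include <stdio.h>

/* Samples a row-major float image at a fractional coordinate using the same
 * four lerp weights the macro builds in coef0/coef1. Illustrative only. */
static float bilinear_sample(const float *img, int width, float x, float y)
{
    int x0 = (int)floorf(x), y0 = (int)floorf(y);
    float lx = x - (float)x0, ly = y - (float)y0;
    float w00 = (1.f - lx) * (1.f - ly);
    float w10 = lx * (1.f - ly);
    float w01 = (1.f - lx) * ly;
    float w11 = lx * ly;
#define AT(X, Y) img[(Y) * width + (X)]
    return w00 * AT(x0, y0)     + w10 * AT(x0 + 1, y0)
         + w01 * AT(x0, y0 + 1) + w11 * AT(x0 + 1, y0 + 1);
#undef AT
}

int main(void)
{
    float img[4] = { 0.f, 1.f, 2.f, 3.f };                /* 2x2 test image */
    printf("%f\n", bilinear_sample(img, 2, 0.5f, 0.5f));  /* prints 1.5     */
    return 0;
}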
+WARP_AFFINE_SH_IMPL(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +WARP_AFFINE_SH_IMPL(I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +WARP_AFFINE_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +WARP_AFFINE_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +WARP_AFFINE_SH_IMPL(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +WARP_AFFINE_SH_IMPL(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 962644c..0b450f0 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -1820,24 +1820,27 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1( \\\n\ _viv_asm(COPY, mean, _mean, 16); \\\n\ VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, var, _var, 16); \\\n\ - float4 gamma0 = read_imagef(Gamma, coord); \\\n\ - coord.x += 4; \\\n\ - float4 gamma1 = read_imagef(Gamma, coord); \\\n\ - coord.x -= 4; \\\n\ - float4 beta = read_imagef(Beta, coord); \\\n\ + int4 coord_in = coord; \\\n\ + int depth = get_image_array_size(Gamma); \\\n\ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \\\n\ + float4 gamma = read_imagef(Gamma, coord_in); \\\n\ + coord_in.z = coord.z; \\\n\ + depth = get_image_array_size(Beta); \\\n\ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \\\n\ + float4 beta = read_imagef(Beta, coord_in); \\\n\ \\\n\ float4 src0, src1, m, v; \\\n\ VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ - gamma0 = gamma0 * rsqrt(v + eps); \\\n\ + float4 gamma0 = gamma.xxxx * rsqrt(v + eps); \\\n\ src0 = src0 * input_scale + input_tail; \\\n\ src0 = (src0 - m) * gamma0 + beta.xxxx; \\\n\ src0 = src0 * output_scale + output_zp; \\\n\ VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ - gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + float4 gamma1 = gamma.xxxx * rsqrt(v + eps); \\\n\ src1 = src1 * input_scale + input_tail; \\\n\ src1 = (src1 - m) * gamma1 + beta.xxxx; \\\n\ src1 = src1 * output_scale + output_zp; \\\n\ @@ -1885,22 +1888,21 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1_2D( \\\n\ _viv_asm(COPY, mean, _mean, 16); \\\n\ VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, var, _var, 16); \\\n\ - float4 gamma0 = read_imagef(Gamma, coord.xy); \\\n\ - float4 gamma1 = read_imagef(Gamma, coord.zy); \\\n\ + float4 gamma = read_imagef(Gamma, coord.xy); \\\n\ float4 beta = read_imagef(Beta, coord.xy); \\\n\ \\\n\ float4 src0, src1, m, v; \\\n\ VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ - gamma0 
= gamma0 * rsqrt(v + eps); \\\n\ + float4 gamma0 = gamma.xxxx * rsqrt(v + eps); \\\n\ src0 = src0 * input_scale + input_tail; \\\n\ src0 = (src0 - m) * gamma0 + beta.xxxx; \\\n\ src0 = src0 * output_scale + output_zp; \\\n\ VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ - gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + float4 gamma1 = gamma.xxxx * rsqrt(v + eps); \\\n\ src1 = src1 * input_scale + input_tail; \\\n\ src1 = (src1 - m) * gamma1 + beta.xxxx; \\\n\ src1 = src1 * output_scale + output_zp; \\\n\ @@ -1948,12 +1950,18 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0( \\\n\ _viv_asm(COPY, mean, _mean, 16); \\\n\ VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, var, _var, 16); \\\n\ - float4 gamma0 = read_imagef(Gamma, coord); \\\n\ - float4 beta0 = read_imagef(Beta, coord); \\\n\ - coord.x += 4; \\\n\ - float4 gamma1 = read_imagef(Gamma, coord); \\\n\ - float4 beta1 = read_imagef(Beta, coord); \\\n\ - coord.x -= 4; \\\n\ + int4 coord_in0 = coord; \\\n\ + int depth = get_image_array_size(Gamma); \\\n\ + _viv_asm(CLAMP0MAX, coord_in0.z, coord_in0.z, depth - 1); \\\n\ + float4 gamma0 = read_imagef(Gamma, coord_in0); \\\n\ + int4 coord_in1 = coord; \\\n\ + depth = get_image_array_size(Beta); \\\n\ + _viv_asm(CLAMP0MAX, coord_in1.z, coord_in1.z, depth - 1); \\\n\ + float4 beta0 = read_imagef(Beta, coord_in1); \\\n\ + coord_in0.x += 4; \\\n\ + coord_in1.x += 4; \\\n\ + float4 gamma1 = read_imagef(Gamma, coord_in0); \\\n\ + float4 beta1 = read_imagef(Beta, coord_in1); \\\n\ \\\n\ float4 src0, src1, m, v; \\\n\ VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ @@ -2054,7 +2062,6 @@ BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)\n\ BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ -\n\ "; /* end of batchnorm_single_f32_vx*/ static const char cast_vx[] = "\n\ @@ -2807,14 +2814,6 @@ __kernel void conv1d_U8U8I32toU8_K1024_SMALL(\n\ VXC_WriteImage(output, coord.wy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -inline uchar* get_image2D_array_ptr(image2d_array_t input)\n\ -{\n\ - int8 desc;\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ - uchar *src_ptr = (uchar*)desc.s0;\n\ - return src_ptr;\n\ -}\n\ -\n\ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t weight,\n\ @@ -2836,9 +2835,11 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ vxc_short8 w_zp = (short)weight_ZP;\n\ vxc_uchar16 input_val = 0, weight_val = 0;\n\ int temp = 0, i, j;\n\ - uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input);\n\ + Tensor src_tensor = create_image_from_image2d(input, 1);\n\ + uchar *src_ptr_base = (uchar *)src_image.ptr;\n\ uchar *src_ptr;\n\ - uchar *dst_ptr = (uchar *)get_image2D_array_ptr(output);\n\ + Tensor dst_tensor = create_image_from_image2d(output, 1);\n\ + uchar *dst_ptr = (uchar *)dst_tensor.ptr;\n\ \n\ temp = read_imagei(bias, coord.yz).x;\n\ sum0 = convert_float(temp);\n\ @@ 
-2846,7 +2847,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ \n\ for (i = 0; i < input_height; i++)\n\ {\n\ - src_ptr = src_ptr_base + (coord.x + coord.z * input_width);\n\ + src_ptr = src_ptr_base + (coord.x + coord.z * src_image.stride_y);\n\ for (j = 0; j < kernel_cnt_x16; j++)\n\ {\n\ VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ @@ -2885,7 +2886,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ _viv_asm(CONV_SAT_RTE, result1, sum1);\n\ vxc_uchar8 result;\n\ VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);\n\ - dst_ptr = dst_ptr + (coord.w + coord.y * output_width);\n\ + dst_ptr = dst_ptr + (coord.w + coord.y * dst_tensor.stride_y);\n\ VXC_Vstore8(dst_ptr, 0, result);\n\ }\n\ \n\ @@ -4110,6 +4111,56 @@ float4 eltwise_unary_round(float4 x)\n\ return convert_float4(convert_int4_rte(x));\n\ }\n\ \n\ +#define MUL2_RSQRTPI (1.1283791670955126f)\n\ +float erf_eval(float x)\n\ +{\n\ + float res = 0;\n\ + float tmp = x;\n\ + float factorial = 1;\n\ + float x_pow = x;\n\ + float one = 1.0f;\n\ + float n = 1;\n\ +\n\ + if (x <= -3)\n\ + return -1;\n\ + else if(x >= 3)\n\ + return 1;\n\ +\n\ + while (fabs(tmp) > 1e-5)\n\ + {\n\ + res += tmp;\n\ +\n\ + factorial *= n;\n\ + one *= -1;\n\ + x_pow *= x * x;\n\ + tmp = one / factorial * x_pow / ( 2 * n + 1);\n\ +\n\ + n += 1.0f;\n\ + }\n\ + return res * MUL2_RSQRTPI;\n\ +}\n\ +#define RSQRT2 (0.70710678118654752440084436210485f)\n\ +float4 eltwise_unary_gelu(float4 x)\n\ +{\n\ + float4 erf, data;\n\ + data = x * RSQRT2;\n\ + erf.x = erf_eval(data.x);\n\ + erf.y = erf_eval(data.y);\n\ + erf.z = erf_eval(data.z);\n\ + erf.w = erf_eval(data.w);\n\ + x = 0.5f * x * (1 + erf);\n\ +\n\ + return x;\n\ +}\n\ +\n\ +#define SQRT_2_RCP_PI 0.7978845834732056f\n\ +float4 eltwise_unary_hard_gelu(float4 x)\n\ +{\n\ + float4 cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *\n\ + (x + 0.044715f * x * x * x));\n\ + return x * cdf;\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -4241,6 +4292,28 @@ ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//GELU\n\ +ELTSISE_UNARY_2D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//HARD_GELU\n\ +ELTSISE_UNARY_2D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, 
vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -4290,6 +4363,10 @@ ELTSISE_UNARY_BF16_2D(mish)\n\ ELTSISE_UNARY_BF16_2D(hard_sigmoid)\n\ //ROUND\n\ ELTSISE_UNARY_BF16_2D(round)\n\ +//GELU\n\ +ELTSISE_UNARY_BF16_2D(gelu)\n\ +//HARD_GELU\n\ +ELTSISE_UNARY_BF16_2D(hard_gelu)\n\ "; /* end of eltwise_unary_2d_vx*/ static const char eltwise_unary_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -4366,6 +4443,56 @@ float4 eltwise_unary_round(float4 x)\n\ return convert_float4(convert_int4_rte(x));\n\ }\n\ \n\ +#define MUL2_RSQRTPI (1.1283791670955126f)\n\ +float erf_eval(float x)\n\ +{\n\ + float res = 0;\n\ + float tmp = x;\n\ + float factorial = 1;\n\ + float x_pow = x;\n\ + float one = 1.0f;\n\ + float n = 1;\n\ +\n\ + if (x <= -3)\n\ + return -1;\n\ + else if(x >= 3)\n\ + return 1;\n\ +\n\ + while (fabs(tmp) > 1e-5)\n\ + {\n\ + res += tmp;\n\ +\n\ + factorial *= n;\n\ + one *= -1;\n\ + x_pow *= x * x;\n\ + tmp = one / factorial * x_pow / ( 2 * n + 1);\n\ +\n\ + n += 1.0f;\n\ + }\n\ + return res * MUL2_RSQRTPI;\n\ +}\n\ +#define RSQRT2 (0.70710678118654752440084436210485f)\n\ +float4 eltwise_unary_gelu(float4 x)\n\ +{\n\ + float4 erf, data;\n\ + data = x * RSQRT2;\n\ + erf.x = erf_eval(data.x);\n\ + erf.y = erf_eval(data.y);\n\ + erf.z = erf_eval(data.z);\n\ + erf.w = erf_eval(data.w);\n\ + x = 0.5f * x * (1 + erf);\n\ +\n\ + return x;\n\ +}\n\ +\n\ +#define SQRT_2_RCP_PI 0.7978845834732056f\n\ +float4 eltwise_unary_hard_gelu(float4 x)\n\ +{\n\ + float4 cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *\n\ + (x + 0.044715f * x * x * x));\n\ + return x * cdf;\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -4497,6 +4624,28 @@ ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc ELTSISE_UNARY_3D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//GELU\n\ +ELTSISE_UNARY_3D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ 
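The eltwise_unary_gelu / eltwise_unary_hard_gelu helpers registered above implement GELU(x) = 0.5*x*(1 + erf(x/sqrt(2))), with erf evaluated by the Maclaurin series in erf_eval (clamped to +/-1 outside [-3, 3]), and the tanh approximation 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))). A small stand-alone C check of the two formulas against libm, with illustrative names, is:

#include <math.h>
#include <stdio.h>

/* Exact GELU via libm's erff (what the shader's erf_eval series approximates). */
static float gelu_exact(float x)
{
    return 0.5f * x * (1.f + erff(x * 0.70710678f));   /* x * 1/sqrt(2) */
}

/* The tanh-based form used by the hard_gelu variant. */
static float gelu_tanh(float x)
{
    const float c = 0.7978845834732056f;               /* sqrt(2/pi) */
    return 0.5f * x * (1.f + tanhf(c * (x + 0.044715f * x * x * x)));
}

int main(void)
{
    for (float x = -3.f; x <= 3.f; x += 1.5f)
        printf("x=% .1f  exact=% .6f  tanh=% .6f\n", x, gelu_exact(x), gelu_tanh(x));
    return 0;
}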
+ELTSISE_UNARY_3D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//HARD_GELU\n\ +ELTSISE_UNARY_3D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -4544,13 +4693,18 @@ ELTSISE_UNARY_BF16(mish)\n\ //HARD_SIGMOID\n\ ELTSISE_UNARY_BF16(hard_sigmoid)\n\ //ROUND\n\ -ELTSISE_UNARY_BF16(round)"; /* end of eltwise_unary_3d_vx*/ +ELTSISE_UNARY_BF16(round)\n\ +//GELU\n\ +ELTSISE_UNARY_BF16(gelu)\n\ +//HARD_GELU\n\ +ELTSISE_UNARY_BF16(hard_gelu)"; /* end of eltwise_unary_3d_vx*/ static const char erf_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define MUL2_RSQRTPI (1.1283791670955126f)\n\ -float eltwise_unary_erf(float x)\n\ +float eltwise_unary_erf(float _x)\n\ {\n\ + float x = clamp(_x, -2, 2);\n\ float res = 0;\n\ float tmp = x;\n\ float factorial = 1;\n\ @@ -4721,6 +4875,73 @@ ELTSISE_UNARY_3D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_s //ERF\n\ ELTSISE_UNARY_BF16_3D(erf)"; /* end of erf_vx*/ +static const char extra_ending_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void extra_ending_I16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 data;\n\ + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void extra_ending_F16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 data;\n\ + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void extra_ending_I8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char8 data;\n\ + VXC_ReadImage2DArray(data, 
input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void extra_ending_U8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar8 data;\n\ + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of extra_ending_vx*/ + static const char floordiv_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ @@ -6018,6 +6239,193 @@ GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8)\n\ \n\ "; /* end of gather_nd_mix_vx*/ +static const char get_matrix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 theta_1;\n\ +_viv_uniform float4 theta_2;\n\ +_viv_uniform float4 scale;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +\n\ +#define GET_MATRIX_SH_IMPL(name0, in_type, read_func) \\\n\ +__kernel void get_matrix_##name0##toF32 \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int has_theta_1_1, \\\n\ + int has_theta_1_2, \\\n\ + int has_theta_1_3, \\\n\ + int has_theta_2_1, \\\n\ + int has_theta_2_2, \\\n\ + int has_theta_2_3, \\\n\ + float theta_1_1, \\\n\ + float theta_1_2, \\\n\ + float theta_1_3, \\\n\ + float theta_2_1, \\\n\ + float theta_2_2, \\\n\ + float theta_2_3, \\\n\ + float i_width, \\\n\ + float i_height, \\\n\ + float o_width, \\\n\ + float o_height \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(0, get_global_id(1)); \\\n\ + float4 matrix0, matrix1; \\\n\ + float4 theta1, theta2; \\\n\ + _viv_asm(COPY, theta1, theta_1, 16); \\\n\ + _viv_asm(COPY, theta2, theta_2, 16); \\\n\ + \\\n\ + if (has_theta_1_1 == 0) \\\n\ + { \\\n\ + in_type data = read_func(input, coord); \\\n\ + coord.x ++; \\\n\ + theta1.x = convert_float(data.x) * input_scale + input_tail; \\\n\ + } \\\n\ + \\\n\ + if (has_theta_1_2 == 0) \\\n\ + { \\\n\ + in_type data = read_func(input, coord); \\\n\ + coord.x ++; \\\n\ + theta1.y = convert_float(data.x) * input_scale + input_tail; \\\n\ + } \\\n\ + \\\n\ + if (has_theta_1_3 == 0) \\\n\ + { \\\n\ + in_type data = read_func(input, coord); \\\n\ + coord.x ++; \\\n\ + theta1.z = convert_float(data.x) * input_scale + input_tail; \\\n\ + } \\\n\ + \\\n\ + if (has_theta_2_1 == 0) \\\n\ + { \\\n\ + in_type data = read_func(input, coord); \\\n\ + coord.x ++; \\\n\ + theta2.x = convert_float(data.x) * input_scale + input_tail; \\\n\ + } \\\n\ + \\\n\ + if (has_theta_2_2 == 0) \\\n\ + { \\\n\ + in_type data = read_func(input, coord); \\\n\ + coord.x ++; \\\n\ + theta2.y = convert_float(data.x) * input_scale + input_tail; \\\n\ + } \\\n\ + \\\n\ + if (has_theta_2_3 == 0) \\\n\ + { \\\n\ + in_type data = read_func(input, coord); \\\n\ + coord.x ++; \\\n\ + theta2.z = convert_float(data.x) * input_scale + input_tail; \\\n\ + } \\\n\ + \\\n\ + matrix0.x = theta2.y * scale.x; \\\n\ + matrix0.z = theta2.x * scale.z; \\\n\ + matrix1.x = ( theta2.z - theta2.y - theta2.x + 1) * i_width * 0.5f; \\\n\ + matrix0.y = theta1.y * scale.w; \\\n\ + matrix0.w = theta1.x * scale.y; \\\n\ + matrix1.y = ( theta1.z - theta1.y - theta1.x + 1) * i_height * 0.5f; 
\\\n\ + matrix1.zw = 2.0f * matrix0.xy; \\\n\ + \\\n\ + coord.x = 0; \\\n\ + vxc_ushort8 dst; \\\n\ + _viv_asm(COPY, dst, matrix0, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, dst, matrix1, 16); \\\n\ + coord.x = 8; \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GET_MATRIX_SH_IMPL(I16, int4, read_imagei)\n\ +GET_MATRIX_SH_IMPL(I8, int4, read_imagei)\n\ +GET_MATRIX_SH_IMPL(U8, uint4, read_imageui)\n\ +\n\ +__kernel void get_matrix_F16toF32\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int has_theta_1_1,\n\ + int has_theta_1_2,\n\ + int has_theta_1_3,\n\ + int has_theta_2_1,\n\ + int has_theta_2_2,\n\ + int has_theta_2_3,\n\ + float theta_1_1,\n\ + float theta_1_2,\n\ + float theta_1_3,\n\ + float theta_2_1,\n\ + float theta_2_2,\n\ + float theta_2_3,\n\ + float i_width,\n\ + float i_height,\n\ + float o_width,\n\ + float o_height\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(1));\n\ + float4 matrix0, matrix1;\n\ + float4 theta1, theta2;\n\ + _viv_asm(COPY, theta1, theta_1, 16);\n\ + _viv_asm(COPY, theta2, theta_2, 16);\n\ +\n\ + if (has_theta_1_1 == 0)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord.x ++;\n\ + theta1.x = data.x;\n\ + }\n\ +\n\ + if (has_theta_1_2 == 0)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord.x ++;\n\ + theta1.y = data.x;\n\ + }\n\ +\n\ + if (has_theta_1_3 == 0)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord.x ++;\n\ + theta1.z = data.x;\n\ + }\n\ +\n\ + if (has_theta_2_1 == 0)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord.x ++;\n\ + theta2.x = data.x;\n\ + }\n\ +\n\ + if (has_theta_2_2 == 0)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord.x ++;\n\ + theta2.y = data.x;\n\ + }\n\ +\n\ + if (has_theta_2_3 == 0)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord.x ++;\n\ + theta2.z = data.x;\n\ + }\n\ +\n\ + matrix0.x = theta2.y * scale.x;\n\ + matrix0.z = theta2.x * scale.z;\n\ + matrix1.x = ( theta2.z - theta2.y - theta2.x + 1) * i_width * 0.5f;\n\ + matrix0.y = theta1.y * scale.w;\n\ + matrix0.w = theta1.x * scale.y;\n\ + matrix1.y = ( theta1.z - theta1.y - theta1.x + 1) * i_height * 0.5f;\n\ + matrix1.zw = 2.0f * matrix0.xy;\n\ +\n\ + coord.x = 0;\n\ + vxc_ushort8 dst;\n\ + _viv_asm(COPY, dst, matrix0, 16);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, dst, matrix1, 16);\n\ + coord.x = 8;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of get_matrix_vx*/ + static const char group_normalization_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ @@ -8486,7 +8894,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ vxc_short8 src0;\n\ vxc_half8 in_h;\n\ vxc_float4 sumsqr;\n\ @@ -8603,7 +9011,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int4 
coord_para = (int4)(gidz, 0, 0, 0);\n\ vxc_short8 src0;\n\ vxc_short8 src1;\n\ @@ -8636,17 +9045,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, in_h, src0, 16);\n\ +\n\ + coord_in.y ++;\n\ \n\ VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ @@ -8661,7 +9072,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -8753,7 +9164,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ vxc_short8 src0;\n\ float sum = 0, sqr = 0;\n\ vxc_float4 sumsqr = (vxc_float4)(0);\n\ @@ -8770,7 +9181,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -8833,7 +9244,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean {\n\ for(; coord.y < endH;)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_ReadImage(src0, input, coord, 0,\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -8881,7 +9292,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ vxc_short8 src0;\n\ vxc_short8 src1;\n\ @@ -8889,7 +9301,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ 
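The instance_norm kernels patched in this resource string all share the same math: a meanvari pass reduces per-channel sum and sum-of-squares, and the normalization pass applies y = (x - mean) * gamma * rsqrt(var + eps) + beta (scale_vari and bias_val in the shader); the surrounding edits appear to rework only the 3D addressing, using coord_in for loads and coord for stores. A compact C sketch of the per-channel math, assuming a single H*W plane and illustrative names, is:

#include <math.h>
#include <stdio.h>

/* Per-channel instance norm over one H*W plane; illustrative only. */
static void instance_norm_plane(const float *x, float *y, int n,
                                float gamma, float beta, float eps)
{
    float sum = 0.f, sqr = 0.f;
    for (int i = 0; i < n; ++i) {           /* the "meanvari" reduction */
        sum += x[i];
        sqr += x[i] * x[i];
    }
    float mean = sum / (float)n;
    float var  = sqr / (float)n - mean * mean;
    float scale_vari = gamma / sqrtf(var + eps);
    float bias_val   = beta - mean * scale_vari;
    for (int i = 0; i < n; ++i)             /* the normalization pass */
        y[i] = x[i] * scale_vari + bias_val;
}

int main(void)
{
    float x[4] = { 1.f, 2.f, 3.f, 4.f }, y[4];
    instance_norm_plane(x, y, 4, 1.f, 0.f, 1e-5f);
    printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
    return 0;
}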
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -8917,16 +9329,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertInt16Fp32Fst_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -8940,7 +9353,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -8965,7 +9378,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -8992,7 +9405,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t \n\ for(; coord.y < endH; coord.y++)\n\ {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src0, input, coord.xy, 0,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -9021,7 +9434,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ vxc_short8 src0, src2;\n\ vxc_short8 src1;\n\ @@ -9029,7 +9443,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ @@ -9053,15 +9467,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t int8 input_desc, output_desc;\n\ _viv_asm(COPY, 
input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertInt16Fp32Fst_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -9073,7 +9488,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toInt16_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -9098,7 +9513,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ @@ -9121,7 +9536,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t \n\ for(; coord.y < endH; coord.y++)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src0, input, coord, 0,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertInt16Fp32Fst_4x4);\n\ @@ -9167,7 +9582,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ vxc_char16 src0;\n\ float sum = 0, sqr = 0;\n\ int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ @@ -9279,7 +9694,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ vxc_char16 src0;\n\ vxc_short8 src1, outval;\n\ @@ -9417,7 +9832,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ vxc_char16 src0, src2;\n\ vxc_short8 src1;\n\ @@ -9449,16 +9865,17 @@ __kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ @@ -9473,7 +9890,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to norm = tmpData3 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ @@ -9562,7 +9979,8 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \\\n\ image2d_array_t output, float eps, int rsFlg) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ int2 coord_para = (int2)(gidz, 0); \\\n\ read_type src0, src2; \\\n\ float scale_vari, bias_val; \\\n\ @@ -9597,15 +10015,16 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \\\n\ int8 input_desc, output_desc; \\\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ - _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \\\n\ - _viv_asm(MOV, coord.w, baseAddr); \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ \\\n\ for(coord.y = 0; coord.y < height; coord.y++) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvert1stUint8SubZpToFp32_4x4); \\\n\ VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ @@ -9624,7 +10043,7 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \\\n\ norm = tmpData3 * alpha + bias_val; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - 
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ } \\\n\ }\n\ INSTANCENORM_8BITS_F32(U8, vxc_uchar16)\n\ @@ -9703,7 +10122,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int2 coord_para = (int2)(gidz, 0);\n\ vxc_short8 src0, src2;\n\ float scale_vari, bias_val;\n\ @@ -9738,15 +10158,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertInt16Fp32Fst_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -9758,7 +10179,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toInt16_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -9840,7 +10261,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ vxc_short8 src0, src1, src2;\n\ float4 srcA, srcB;\n\ vxc_float sum = 0, sqr = 0;\n\ @@ -9957,7 +10378,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ vxc_short8 src0, src1, src2;\n\ float scale_vari, bias_val;\n\ vxc_float4 mean_vari = (vxc_float4)(0);\n\ @@ -9967,7 +10389,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 Image img3 = create_image_from_image2d(meanVari, 4);\n\ __global float* bias_ptr = (__global float*)img1.ptr;\n\ __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));\n\ __global float4* 
vari_ptr = (__global float4*)sumVari_ptr;\n\ \n\ float bval = bias_ptr[gidz];\n\ @@ -9989,16 +10411,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ uniConvBF16toF32_Part0_2x8);\n\ VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ @@ -10012,7 +10435,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 norm = scale_vari * tmpData1 + bias_val;\n\ _viv_asm(COPY, src1, norm, 16);\n\ VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -10090,7 +10513,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ vxc_short8 src0;\n\ vxc_half8 in_h;\n\ float scale_vari, bias_val;\n\ @@ -10101,7 +10525,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F Image img3 = create_image_from_image2d(meanVari, 4);\n\ __global float* bias_ptr = (__global float*)img1.ptr;\n\ __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));\n\ __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ \n\ float bval = bias_ptr[gidz];\n\ @@ -10126,17 +10550,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, in_h, src0, 16);\n\ +\n\ + coord_in.y ++;\n\ \n\ VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ @@ -10151,7 +10577,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -10250,7 +10676,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int gidx = get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ vxc_uchar16 src0;\n\ float sum = 0, sqr = 0;\n\ int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;\n\ @@ -10265,7 +10691,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ @@ -10317,7 +10743,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean {\n\ for(; coord.y < endH;)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_ReadImage(src0, input, coord, 0,\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ @@ -10354,7 +10780,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ vxc_uchar16 src0, src2;\n\ vxc_short8 src1;\n\ @@ -10362,7 +10789,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ @@ -10387,15 +10814,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y 
++;\n\ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ @@ -10410,7 +10838,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to norm = tmpData3 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ @@ -10429,7 +10857,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ @@ -10453,7 +10881,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to \n\ for(; coord.y < endH; coord.y++)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ @@ -10493,7 +10921,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ vxc_uchar16 src0;\n\ vxc_short8 src1, outval;\n\ @@ -10629,14 +11057,6 @@ do \\\n\ VXC_OP3_NoDest(vstore3, Pointer, byteOffset, Data); } \\\n\ while(0)\n\ \n\ -inline uchar* get_image2D_array_ptr(image2d_array_t input)\n\ -{\n\ - int8 desc;\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ - uchar *src_ptr = (uchar*)desc.s0;\n\ - return src_ptr;\n\ -}\n\ -\n\ #define L2NORMSCALE_SWITCH_PROCESS(case_value, vec_val, ZpValue) \\\n\ switch (case_value) \\\n\ { \\\n\ @@ -10727,8 +11147,10 @@ _viv_uniform int inputZP;\n\ \n\ #define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ vxc_float4 rsqrt0;\\\n\ - dst_type *dst_ptr = (dst_type *)get_image2D_array_ptr(output); \\\n\ - short *scale_ptr = (short *)get_image2D_array_ptr(scale); \\\n\ + Image dst_img = create_image_from_image2d(output, 1); \\\n\ + dst_type *dst_ptr = (dst_type *)dst_img.ptr; \\\n\ + Image s_img = create_image_from_image2d(scale, 2); \\\n\ + short *scale_ptr = (short *)s_img.ptr; \\\n\ vxc_float4 vec0, vec1;\\\n\ convert_type dst0, dst1;\\\n\ vxc_short8 scale_s16;\\\n\ @@ -10811,15 +11233,16 @@ _viv_uniform 
int inputZP;\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \\\n\ (\\\n\ - __read_only image2d_array_t input,\\\n\ - __read_only image2d_array_t scale,\\\n\ - __write_only image2d_array_t output,\\\n\ + __read_only image2d_t input,\\\n\ + __read_only image2d_t scale,\\\n\ + __write_only image2d_t output,\\\n\ int axis\\\n\ )\\\n\ { \\\n\ int lidx = get_local_id(0); \\\n\ int offset = get_global_id(0); \\\n\ - read_type *src_ptr_base = (read_type *)get_image2D_array_ptr(input); \\\n\ + Image src_img = create_image_from_image2d(input, 1); \\\n\ + read_type *src_ptr_base = (read_type *)src_img.ptr; \\\n\ read_type *src_ptr; \\\n\ read_type2 src0, src1; \\\n\ src_type val0, val1; \\\n\ @@ -10890,7 +11313,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ { \\\n\ int lidx = get_local_id(0); \\\n\ int offset = get_global_id(0); \\\n\ - uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input); \\\n\ + Image src_img = create_image_from_image2d(input, 1);\n\ + uchar *src_ptr_base = (uchar *)src_img.ptr; \\\n\ uchar *src_ptr; \\\n\ vxc_uchar8 src0, src1; \\\n\ vxc_uchar8 val0, val1; \\\n\ @@ -11210,7 +11634,7 @@ __kernel void layer_norm_F16toF16(\n\ image2d_array_t input, image2d_t bias, image2d_t scale,\n\ image2d_array_t output, float eps)\n\ {\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_out = coord;\n\ \n\ int8 input_desc, output_desc;\n\ @@ -11220,18 +11644,18 @@ __kernel void layer_norm_F16toF16(\n\ \n\ vxc_short8 src0, src1;\n\ vxc_float sum = 0, sqr = 0;\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.w, baseAddr);\n\ + _viv_asm(MOV, coord_out.z, baseAddr);\n\ \n\ for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ {\n\ vxc_half8 val0_h;\n\ _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ vxc_float4 sumsqr;\n\ VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -11248,7 +11672,7 @@ __kernel void layer_norm_F16toF16(\n\ vxc_float4 bias_f;\n\ for(coord.x = 0; coord.x < width; coord.x += 4)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ @@ -11272,7 +11696,7 @@ __kernel void layer_norm_F16toF16(\n\ vxc_short8 dstval;\n\ _viv_asm(COPY, dstval, dst, 16);\n\ coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -11298,7 +11722,7 @@ __kernel void layer_norm_U8toU8(\n\ image2d_array_t input, image2d_t bias, image2d_t scale,\n\ image2d_array_t output, float eps)\n\ {\n\ - int4 coord = (int4)(0, 
get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_out = coord;\n\ \n\ vxc_uchar16 src0, src2;\n\ @@ -11318,11 +11742,11 @@ __kernel void layer_norm_U8toU8(\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.w, baseAddr);\n\ + _viv_asm(MOV, coord_out.z, baseAddr);\n\ \n\ for(coord.x = 0; coord.x < width; coord.x += 16)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ tmpSum += (tmpSum1.x);\n\ @@ -11343,7 +11767,7 @@ __kernel void layer_norm_U8toU8(\n\ \n\ for(coord.x = 0; coord.x < width; coord.x += 16)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -11402,7 +11826,7 @@ __kernel void layer_norm_U8toU8(\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8);\n\ coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \\\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -11411,7 +11835,7 @@ __kernel void layer_norm_F16toU8(\n\ image2d_array_t input, image2d_t bias, image2d_t scale,\n\ image2d_array_t output, float eps)\n\ {\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_out = coord;\n\ \n\ int8 input_desc, output_desc;\n\ @@ -11421,18 +11845,18 @@ __kernel void layer_norm_F16toU8(\n\ \n\ vxc_short8 src0, src1;\n\ vxc_float sum = 0, sqr = 0;\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.w, baseAddr);\n\ + _viv_asm(MOV, coord_out.z, baseAddr);\n\ \n\ for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ {\n\ vxc_half8 val0_h;\n\ _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ vxc_float4 sumsqr;\n\ VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -11449,7 +11873,7 @@ __kernel void layer_norm_F16toU8(\n\ vxc_float4 bias_f;\n\ for(coord.x = 0; coord.x < width; coord.x += 4)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ @@ -11472,7 +11896,7 @@ __kernel void 
layer_norm_F16toU8(\n\ VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ uniConvertInt32toUint8_2x8);\n\ coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }"; /* end of layer_normalization_vx*/ @@ -11736,24 +12160,25 @@ __kernel void layer_norm_I16toI16(\n\ image2d_array_t input, image2d_t bias, image2d_t scale,\n\ image2d_array_t output, float eps)\n\ {\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ \n\ int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ \n\ vxc_short8 src0, src1, dst;\n\ vxc_float sum = 0, sqr = 0;\n\ - for(; coord.x < width;)\n\ + for(; coord_in.x < width;)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ + coord_in.x += 8;\n\ vxc_float4 sumsqr;\n\ VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ uniInt16SumSqr_dp8x2);\n\ @@ -11775,11 +12200,11 @@ __kernel void layer_norm_I16toI16(\n\ \n\ int2 coord_bias = (int2)(0, 0);\n\ \n\ - for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord.x;\n\ + coord_bias.x = coord_in.x;\n\ VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ @@ -11807,7 +12232,7 @@ __kernel void layer_norm_I16toI16(\n\ \n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, dst, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -11895,7 +12320,7 @@ __kernel void layer_norm_F16F32toF16(\n\ image2d_array_t input, image2d_t bias, image2d_t scale,\n\ image2d_array_t output, float eps)\n\ {\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_out = coord;\n\ \n\ int8 input_desc, output_desc;\n\ @@ -11905,20 +12330,20 @@ __kernel void layer_norm_F16F32toF16(\n\ \n\ vxc_short8 src0;\n\ vxc_float sum = 0, sqr = 0;\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ Image img1 = create_image_from_image2d(bias, 
4);\n\ Image img2 = create_image_from_image2d(scale, 4);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.w, baseAddr);\n\ + _viv_asm(MOV, coord_out.z, baseAddr);\n\ \n\ for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ {\n\ vxc_half8 val0_h;\n\ _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ vxc_float4 sumsqr;\n\ VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -11933,11 +12358,11 @@ __kernel void layer_norm_F16F32toF16(\n\ vari += eps;\n\ vari = rsqrt(vari);\n\ vxc_float4 bias_f, scale_f, in_f;\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));\n\ for(coord.x = 0; coord.x < width; coord.x += 4)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ bias_f = vload4(0, bias_ptr + coord.x);\n\ scale_f = vload4(0, scale_ptr + coord.x);\n\ @@ -11956,7 +12381,7 @@ __kernel void layer_norm_F16F32toF16(\n\ vxc_short8 dstval;\n\ _viv_asm(COPY, dstval, dst, 16);\n\ coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -11984,7 +12409,7 @@ __kernel void layer_norm_U8F32toU8(\n\ image2d_array_t input, image2d_t bias, image2d_t scale,\n\ image2d_array_t output, float eps)\n\ {\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_out = coord;\n\ \n\ vxc_uchar16 src0, src2;\n\ @@ -12002,11 +12427,11 @@ __kernel void layer_norm_U8F32toU8(\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.w, baseAddr);\n\ + _viv_asm(MOV, coord_out.z, baseAddr);\n\ \n\ for(coord.x = 0; coord.x < width; coord.x += 16)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ tmpSum += (tmpSum1.x);\n\ @@ -12026,11 +12451,11 @@ __kernel void layer_norm_U8F32toU8(\n\ \n\ Image img1 = create_image_from_image2d(bias, 4);\n\ Image img2 = create_image_from_image2d(scale, 4);\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));\n\ for(coord.x = 0; coord.x < width; 
coord.x += 16)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = vload4(0, bias_ptr);\n\ bias_f1 = vload4(1, bias_ptr);\n\ @@ -12077,7 +12502,7 @@ __kernel void layer_norm_U8F32toU8(\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8);\n\ coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \\\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -12086,24 +12511,25 @@ __kernel void layer_norm_I16F32toI16(\n\ image2d_array_t input, image2d_t bias, image2d_t scale,\n\ image2d_array_t output, float eps)\n\ {\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ \n\ int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr);\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ \n\ vxc_short8 src0, dst;\n\ vxc_float sum = 0, sqr = 0;\n\ - for(; coord.x < width;)\n\ + for(; coord_in.x < width;)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ + coord_in.x += 8;\n\ vxc_float4 sumsqr;\n\ VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ uniInt16SumSqr_dp8x2);\n\ @@ -12127,9 +12553,9 @@ __kernel void layer_norm_I16F32toI16(\n\ Image img2 = create_image_from_image2d(scale, 4);\n\ __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias);\n\ __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias);\n\ - for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = vload4(0, bias_ptr);\n\ bias_f1 = vload4(1, bias_ptr);\n\ @@ -12153,7 +12579,7 @@ __kernel void layer_norm_I16F32toI16(\n\ \n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, dst, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }"; /* end of layer_normalization_scale_f32_vx*/ @@ -12409,7 +12835,7 @@ __kernel void layer_norm_BF16F32toBF16(\n\ image2d_array_t input, image2d_t bias, image2d_t scale,\n\ image2d_array_t output, float eps)\n\ {\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_out = coord;\n\ \n\ int8 input_desc, 
output_desc;\n\ @@ -12428,7 +12854,7 @@ __kernel void layer_norm_BF16F32toBF16(\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.w, baseAddr);\n\ + _viv_asm(MOV, coord_out.z, baseAddr);\n\ float4 srcA, srcB;\n\ for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ {\n\ @@ -12438,7 +12864,7 @@ __kernel void layer_norm_BF16F32toBF16(\n\ uniConvBF16toF32_Part1_2x8);\n\ _viv_asm(COPY, srcA, src1, 16);\n\ _viv_asm(COPY, srcB, src2, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ sum += dot(srcA, ones) + dot(srcB, ones);\n\ sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones);\n\ @@ -12450,12 +12876,12 @@ __kernel void layer_norm_BF16F32toBF16(\n\ vari += eps;\n\ vari = rsqrt(vari);\n\ vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));\n\ \n\ for(coord.x = 0; coord.x < width; coord.x += 8)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = vload4(0, bias_ptr);\n\ bias_f1 = vload4(1, bias_ptr);\n\ @@ -12483,7 +12909,7 @@ __kernel void layer_norm_BF16F32toBF16(\n\ VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ \n\ coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -12584,7 +13010,7 @@ __kernel void layer_norm_U8toF16(\n\ image2d_array_t output,\n\ float eps)\n\ {\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_out = coord;\n\ vxc_uchar16 src0;\n\ float sum = 0, sqr = 0;\n\ @@ -12599,11 +13025,11 @@ __kernel void layer_norm_U8toF16(\n\ \n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.w, baseAddr);\n\ + _viv_asm(MOV, coord_out.z, baseAddr);\n\ \n\ for(coord.x = 0; coord.x < width; coord.x += 16)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ tmpSum += (tmpSum1.x);\n\ @@ -12629,7 +13055,7 @@ __kernel void layer_norm_U8toF16(\n\ \n\ for(coord.x = 0; coord.x < width; coord.x += 16)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0));\n\ @@ -12679,7 +13105,7 @@ __kernel void layer_norm_U8toF16(\n\ UniPackFP16even_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ \n\ tmpData2 -= mean;\n\ @@ -12693,7 +13119,7 @@ __kernel void layer_norm_U8toF16(\n\ UniPackFP16even_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ coord_out.x += 8;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -12851,7 +13277,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ _viv_asm(COPY, in_h, src0, 16);\n\ @@ -12946,7 +13372,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to image2d_array_t output, float eps)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int2 coord_sum = (int2)(0, gidz);\n\ int4 coord_para = coord;\n\ coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ @@ -12969,8 +13396,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to \n\ int8 input_desc, scale_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ @@ -12987,11 +13414,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ coord_para.y = coord.y;\n\ coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ @@ -13020,7 +13448,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -13096,7 +13524,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to image2d_array_t output, float eps)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = 
(int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int2 coord_sum = (int2)(0, gidz);\n\ int4 coord_para = coord;\n\ coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ @@ -13119,8 +13548,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to \n\ int8 input_desc, scale_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ @@ -13136,11 +13565,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ coord_para.y = coord.y;\n\ coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ @@ -13168,7 +13598,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -13282,7 +13712,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ vxc_float4 sumsqr;\n\ @@ -13369,7 +13799,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to image2d_array_t output, float eps)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int2 coord_sum = (int2)(0, gidz);\n\ int4 coord_para = coord;\n\ coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ @@ -13391,8 +13822,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to \n\ int8 input_desc, scale_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ @@ -13408,11 +13839,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to \n\ for(coord.y = 0; 
coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ coord_para.y = coord.y;\n\ coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ @@ -13438,7 +13870,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to \n\ VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -13554,7 +13986,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0,\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ @@ -13607,7 +14039,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq {\n\ for(; coord.y < endH;)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_ReadImage(src0, input, coord, 0,\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ @@ -13644,7 +14076,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF image2d_array_t output, float eps)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int2 coord_sum = (int2)(0, gidz);\n\ int4 coord_para = coord;\n\ coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ @@ -13667,8 +14100,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF \n\ int8 input_desc, scale_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ @@ -13684,10 +14117,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ coord_para.y = coord.y; coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ coord_bias.x 
+= 4;\n\ @@ -13714,7 +14148,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -13748,10 +14182,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src0, input, coord, 0,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src1, scale, coord, 0,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ @@ -13787,7 +14221,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU image2d_array_t output, float eps)\n\ {\n\ int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int2 coord_sum = (int2)(0, gidz);\n\ int4 coord_para = coord;\n\ coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ @@ -13810,8 +14245,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU \n\ int8 input_desc, scale_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ \n\ _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ @@ -13827,11 +14262,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ coord_para.y = coord.y;\n\ coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ @@ -13857,7 +14293,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU \n\ VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ @@ -13891,10 +14327,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src0, input, coord, 0,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, 
VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_ReadImage(src1, scale, coord, 0,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ @@ -19214,21 +19650,21 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ {\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_a.x += 4;\n\ coord_b.y += 4;\n\ @@ -19257,22 +19693,22 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ _viv_asm(CONV, valC, sum0);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_b.y++;\n\ _viv_asm(CONV, valC, sum1);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_b.y++;\n\ _viv_asm(CONV, valC, sum2);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_b.y++;\n\ _viv_asm(CONV, valC, sum3);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ #else\n\ @@ -19303,21 +19739,21 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ {\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, 
VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_a.x += 4;\n\ coord_b.y += 4;\n\ @@ -19345,22 +19781,22 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ _viv_asm(CONV, valC, sum0);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_b.y++;\n\ _viv_asm(CONV, valC, sum1);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_b.y++;\n\ _viv_asm(CONV, valC, sum2);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_b.y++;\n\ _viv_asm(CONV, valC, sum3);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ #endif\n\ @@ -19476,21 +19912,21 @@ __kernel void gemm_F16F16to##dst_type_name( \\\n\ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, 
VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -19523,20 +19959,20 @@ __kernel void gemm_F16F16to##dst_type_name( \\\n\ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ #else\n\ @@ -19566,21 +20002,21 @@ __kernel void gemm_F16F16to##dst_type_name( \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB0, inputB, 
coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -19612,20 +20048,20 @@ __kernel void gemm_F16F16to##dst_type_name( \\\n\ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ #endif\n\ @@ -19678,21 +20114,21 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 
0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -19722,20 +20158,20 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ #else\n\ @@ -19766,21 +20202,21 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, 
VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -19810,20 +20246,20 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ #endif\n\ @@ -19872,21 +20308,21 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ vxc_float4 tempZp; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, 
coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -19913,22 +20349,22 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ _viv_asm(CONV, valC, sum0); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum1); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum2); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum3); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_F16_QINT_TO_F16(U8, vxc_uchar16)\n\ @@ -19964,33 +20400,33 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, tmpA, srcA, 16); \\\n\ VXC_DP4x4(tempA0, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32B_4x4); \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, tmpA, srcA, 16); \\\n\ VXC_DP4x4(tempA1, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32B_4x4); \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, 
VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, tmpA, srcA, 16); \\\n\ VXC_DP4x4(tempA2, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32B_4x4); \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -20009,22 +20445,22 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ _viv_asm(CONV, valC, sum0); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum1); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum2); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum3); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_F16_QINT16_TO_F16(I16, vxc_short8)\n\ @@ -20069,21 +20505,21 @@ __kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t input { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ vxc_float4 tempZp; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, 
VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -20109,20 +20545,20 @@ __kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t input tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_F16_QINT_TO_QINT(U8, vxc_uchar16)\n\ @@ -20169,33 +20605,33 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32_4x4); \\\n\ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32B_4x4); \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32_4x4); \\\n\ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32B_4x4); \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32_4x4); \\\n\ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32B_4x4); \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -20216,19 +20652,19 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_QINT_TO_QINT(I16, vxc_short8)\n\ @@ -20284,9 +20720,9 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ \\\n\ for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.y++; \\\n\ 
coord_b.y++; \\\n\ @@ -20308,10 +20744,10 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_b.y++; \\\n\ @@ -20319,10 +20755,10 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_TRANSA_QINT(U8, U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16)\n\ @@ -20369,9 +20805,9 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \\\n\ \\\n\ for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.y++; \\\n\ coord_b.y++; \\\n\ @@ -20393,10 +20829,10 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \\\n\ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_b.y++; \\\n\ @@ -20404,10 +20840,10 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \\\n\ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ 
}\n\ GEMM_TRANSA_INPUTB_F16(U8, vxc_uchar16)\n\ @@ -20451,9 +20887,9 @@ __kernel void gemm_transa_F16F16toF16(\n\ \n\ for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;)\n\ {\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_a.y++;\n\ coord_b.y++;\n\ @@ -20473,25 +20909,25 @@ __kernel void gemm_transa_F16F16toF16(\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ _viv_asm(CONV, valC, sum0);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ coord_b.y++;\n\ _viv_asm(CONV, valC, sum1);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ coord_b.y++;\n\ _viv_asm(CONV, valC, sum2);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ coord_b.y++;\n\ _viv_asm(CONV, valC, sum3);\n\ _viv_asm(COPY, outC, valC, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }"; /* end of matrixmul_transA_vx*/ @@ -20533,21 +20969,21 @@ __kernel void gemm_transb_F16F16toF16(image2d_array_t inputA,\n\ {\n\ vxc_short8 srcA0,srcA1,srcA2,srcA3;\n\ vxc_short8 srcB0,srcB1,srcB2,srcB3;\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, 
VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_a.x += 8;\n\ coord_b.x += 8;\n\ @@ -20662,21 +21098,21 @@ __kernel void gemm_transb_F16U8toF16(image2d_array_t inputA,\n\ {\n\ vxc_short8 srcA0,srcA1,srcA2,srcA3;\n\ vxc_uchar8 srcB0,srcB1,srcB2,srcB3;\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_a.x += 8;\n\ coord_b.x += 8;\n\ @@ -20792,21 +21228,21 @@ __kernel void gemm_transb_F16U8toU8(image2d_array_t inputA,\n\ {\n\ vxc_short8 srcA0,srcA1,srcA2,srcA3;\n\ vxc_uchar8 srcB0,srcB1,srcB2,srcB3;\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB2, inputB, 
coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_a.x += 8;\n\ coord_b.x += 8;\n\ @@ -20927,21 +21363,21 @@ __kernel void gemm_transb_U8U8toF16(image2d_array_t inputA,\n\ {\n\ vxc_uchar8 srcA0,srcA1,srcA2,srcA3;\n\ vxc_uchar8 srcB0,srcB1,srcB2,srcB3;\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_a.x += 8;\n\ coord_b.x += 8;\n\ @@ -21062,21 +21498,21 @@ __kernel void gemm_transb_U8U8toU8(image2d_array_t inputA,\n\ {\n\ vxc_uchar8 srcA0,srcA1,srcA2,srcA3;\n\ vxc_uchar8 srcB0,srcB1,srcB2,srcB3;\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB1, inputB, 
coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_a.x += 8;\n\ coord_b.x += 8;\n\ @@ -21191,21 +21627,21 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -21238,19 +21674,19 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, 
tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_QINT_TO_QINT(U8, vxc_uchar16)\n\ @@ -21304,21 +21740,21 @@ __kernel void gemm_##src0_type_name##F16toF16( \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -21350,22 +21786,22 @@ __kernel void gemm_##src0_type_name##F16toF16( \\\n\ \\\n\ _viv_asm(CONV, valC, sum0); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum1); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum2); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum3); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - 
VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_QINT_F16_TO_F16(U8, vxc_uchar16)\n\ @@ -21401,21 +21837,21 @@ __kernel void gemm_##src0_type_name##F16toF16( \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -21455,22 +21891,22 @@ __kernel void gemm_##src0_type_name##F16toF16( \\\n\ \\\n\ _viv_asm(CONV, valC, sum0); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum1); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum2); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum3); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_QINT_F16_TO_F16(U8, vxc_uchar16)\n\ @@ -21523,21 +21959,21 @@ __kernel void 
gemm_##src0_type_name##F16to##src0_type_name( \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -21567,20 +22003,20 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \\\n\ tmpOut1 = convert_int4_rte(sum1 * in0outScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ tmpOut0 = convert_int4_rte(sum2 * in0outScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum3 * in0outScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ #else\n\ @@ -21614,36 +22050,36 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - 
VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32_4x4); \\\n\ _viv_asm(COPY, tmpB, srcB, 16); \\\n\ VXC_DP4x4(tempB0, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32_4x4); \\\n\ _viv_asm(COPY, tmpB, srcB, 16); \\\n\ VXC_DP4x4(tempB1, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32_4x4); \\\n\ _viv_asm(COPY, tmpB, srcB, 16); \\\n\ VXC_DP4x4(tempB2, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -21665,20 +22101,20 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \\\n\ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toUint8_2x8); \\\n\ - 
VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ #endif\n\ @@ -21730,21 +22166,21 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -21779,22 +22215,22 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ _viv_asm(CONV, valC, sum0); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum1); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum2); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum3); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + 
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_QINT_TO_F16(U8, vxc_uchar16)\n\ @@ -21832,36 +22268,36 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ { \\\n\ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32_4x4); \\\n\ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32B_4x4); \\\n\ \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32_4x4); \\\n\ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32B_4x4); \\\n\ \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32_4x4); \\\n\ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvertUint8SubZpToFp32B_4x4); \\\n\ \\\n\ - VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_a.x += 4; \\\n\ coord_b.y += 4; \\\n\ @@ -21880,22 +22316,22 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ _viv_asm(CONV, valC, sum0); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum1); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 
0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum2); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_b.y++; \\\n\ _viv_asm(CONV, valC, sum3); \\\n\ _viv_asm(COPY, outC, valC, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_QINT16_TO_F16(I16, vxc_short8)\n\ @@ -24396,15 +24832,22 @@ __kernel void moments_axis1_##src0_type_name##toF16( \\\n\ short zp = inputZP;\\\n\ float4 tmpData0;\\\n\ \\\n\ - for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(coord.y = 1; coord.y < height; ) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ sum += (tmpData0); \\\n\ sqr += (tmpData0 * tmpData0); \\\n\ } \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ sum *= input_scale; \\\n\ sqr *= e2InScale; \\\n\ \\\n\ @@ -24441,16 +24884,23 @@ __kernel void moments_axis1_##src0_type_name##toF16_2D( \\\n\ float4 sum = 0, sqr = 0; \\\n\ short zp = inputZP;\\\n\ float4 tmpData0;\\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + for (coord.y = 1; coord.y < height; ) \\\n\ { \\\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ sum += (tmpData0); \\\n\ sqr += (tmpData0 * tmpData0); \\\n\ } \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ sum *= input_scale; \\\n\ sqr *= e2InScale; \\\n\ \\\n\ @@ -24656,6 +25106,430 @@ __kernel void moments_axis2_F16toF16(\n\ VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of moments_axis2_vx*/ +static const char moments_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +_viv_uniform float dimRatio;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform float4 output_ZP;\n\ 
+_viv_uniform float4 outputScale;\n\ +_viv_uniform float output_ZP0;\n\ +_viv_uniform float outputScale0;\n\ +_viv_uniform float output_ZP1;\n\ +_viv_uniform float outputScale1;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +\n\ +#define MOMENTS_AXIS0_QINT_U8(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis0_##src0_type_name##toU8( \\\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidy = get_global_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(0, gidy, gidz, gidz); \\\n\ + read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + int tmpSum = 0, tmpSqr = 0; \\\n\ + int4 tmpSum0, tmpSqr0; \\\n\ + int8 inputA_desc; \\\n\ + _viv_asm(COPY, inputA_desc, input, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)gidz * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP16x1(tmpSum0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + VXC_DP16x1(tmpSqr0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSum += (tmpSum0.x); \\\n\ + tmpSqr += (tmpSqr0.x + tmpZp1 * tmpSum0.x); \\\n\ + } \\\n\ + sqr = (convert_float(tmpSqr) * e2InScale + rowSumScale); \\\n\ + sum = convert_float(tmpSum + sumInZp) * input_scale; \\\n\ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); \\\n\ + mean_vari0 *= dimRatio; \\\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; \\\n\ + int2 coord_out = (int2)(gidy, gidz); \\\n\ + vxc_int4 tmpData = convert_int4_rte(mean_vari0 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +MOMENTS_AXIS0_QINT_U8(U8, vxc_uchar16)\n\ +\n\ +#define MOMENTS_AXIS0_QINT_U8_2D(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis0_##src0_type_name##toU8_2D( \\\n\ + image2d_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidy = get_global_id(0); \\\n\ + int2 coord = (int2)(0, gidy); \\\n\ + read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + int tmpSum = 0, tmpSqr = 0; \\\n\ + int4 tmpSum0, tmpSqr0; \\\n\ + \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP16x1(tmpSum0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + VXC_DP16x1(tmpSqr0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSum += (tmpSum0.x); \\\n\ + tmpSqr += (tmpSqr0.x + tmpZp1 * tmpSum0.x); \\\n\ + } \\\n\ + sqr = (convert_float(tmpSqr) * e2InScale + rowSumScale); \\\n\ + sum = convert_float(tmpSum + sumInZp) * input_scale; \\\n\ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); \\\n\ + mean_vari0 *= dimRatio; \\\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; \\\n\ + int2 coord_out = 
(int2)(gidy, 0); \\\n\ + vxc_int4 tmpData = convert_int4_rte(mean_vari0 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +MOMENTS_AXIS0_QINT_U8_2D(U8, vxc_uchar16)\n\ +\n\ +#define MOMENTS_AXIS01_QINT_U8(src0_type_name, read0_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_##src0_type_name##toU8( \\\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz); \\\n\ + read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + int8 inputA_desc; \\\n\ + _viv_asm(COPY, inputA_desc, input, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)gidz * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + for(coord.x = gidx; coord.x < width; coord.x += 256) \\\n\ + { \\\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + tmpSum += (tmpSum1); \\\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \\\n\ + } \\\n\ + sqr += (tmpSqr * e2InScale + rowSumScale); \\\n\ + sum += (tmpSum + sumInZp) * input_scale; \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + lcl_sqr[lidx] = sqr; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + int2 coord_out = (int2)(gidz, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + sum = (0); \\\n\ + sqr = (0); \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 meanVari; \\\n\ + meanVari.x = sum * dimRatio; \\\n\ + meanVari.y = sqr * dimRatio - meanVari.x * meanVari.x; \\\n\ + vxc_int4 tmpData = convert_int4_rte(meanVari * outputScale + output_ZP); \\\n\ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +MOMENTS_AXIS01_QINT_U8(U8, vxc_uchar16)\n\ +\n\ +#define MOMENTS_AXIS01_QINT_U8_2D(src0_type_name, read0_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_##src0_type_name##toU8_2D( \\\n\ + image2d_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int2 coord = (int2)(gidx, 0); \\\n\ 
+ read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + for(coord.x = gidx; coord.x < width; coord.x += 256) \\\n\ + { \\\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + tmpSum += (tmpSum1); \\\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \\\n\ + } \\\n\ + sqr += (tmpSqr * e2InScale + rowSumScale); \\\n\ + sum += (tmpSum + sumInZp) * input_scale; \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + lcl_sqr[lidx] = sqr; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + int2 coord_out = (int2)(0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + sum = (0); sqr = (0); \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 meanVari; \\\n\ + meanVari.x = sum * dimRatio; \\\n\ + meanVari.y = sqr * dimRatio - meanVari.x * meanVari.x; \\\n\ + vxc_int4 tmpData = convert_int4_rte(meanVari * outputScale + output_ZP); \\\n\ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +MOMENTS_AXIS01_QINT_U8_2D(U8, vxc_uchar16)\n\ +\n\ +#define MOMENTS_AXIS1_QINT_U8(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis1_##src0_type_name##toU8( \\\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz); \\\n\ + read0_type src0; \\\n\ + float4 sum = 0, sqr = 0; \\\n\ + short zp = inputZP;\\\n\ + float4 tmpData0;\\\n\ + \\\n\ + int8 inputA_desc; \\\n\ + _viv_asm(COPY, inputA_desc, input, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)gidz * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(coord.y = 1; coord.y < height; ) \\\n\ + { \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ + } \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ + sum *= input_scale; \\\n\ + sqr *= e2InScale; \\\n\ + \\\n\ + float4 mean = sum * dimRatio; \\\n\ + float4 vari = sqr * dimRatio; \\\n\ + vari = vari - mean * mean; \\\n\ + vxc_int4 tmpVal0 = 
convert_int4_rte(mean * outputScale0 + output_ZP0); \\\n\ + vxc_int4 tmpVal1 = convert_int4_rte(vari * outputScale1 + output_ZP1); \\\n\ + VXC_DP2x8(src0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + int2 coord_out = (int2)(gidx, gidz); \\\n\ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, src0.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +MOMENTS_AXIS1_QINT_U8(U8, vxc_uchar16)\n\ +\n\ +#define MOMENTS_AXIS1_QINT_U8_2D(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis1_##src0_type_name##toU8_2D( \\\n\ + image2d_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int2 coord = (int2)(gidx, 0); \\\n\ + read0_type src0; \\\n\ + float4 sum = 0, sqr = 0; \\\n\ + short zp = inputZP;\\\n\ + float4 tmpData0;\\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + for (coord.y = 1; coord.y < height; ) \\\n\ + { \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ + } \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ + sum *= input_scale; \\\n\ + sqr *= e2InScale; \\\n\ + \\\n\ + float4 mean = sum * dimRatio; \\\n\ + float4 vari = sqr * dimRatio - mean * mean; \\\n\ + vxc_int4 tmpVal0 = convert_int4_rte(mean * outputScale0 + output_ZP0); \\\n\ + vxc_int4 tmpVal1 = convert_int4_rte(vari * outputScale1 + output_ZP1); \\\n\ + VXC_DP2x8(src0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + int2 coord_out = (int2)(gidx, 0); \\\n\ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, src0.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +MOMENTS_AXIS1_QINT_U8_2D(U8, vxc_uchar16)\n\ +\n\ +#define MOMENTS_AXIS2_QINT_U8(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis2_##src0_type_name##toU8( \\\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, gidy, 0, 0); \\\n\ + read0_type src0; \\\n\ + float4 sum = 0, sqr = 0; \\\n\ + short zp = inputZP;\\\n\ + float4 tmpData0;\\\n\ + \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ + } \\\n\ + sum *= input_scale; \\\n\ + sqr *= e2InScale; \\\n\ + \\\n\ + float4 mean = sum * dimRatio; \\\n\ + float4 vari = sqr * dimRatio - mean * mean; \\\n\ + vxc_int4 tmpVal0 = convert_int4_rte(mean * outputScale0 + 
output_ZP0); \\\n\ + vxc_int4 tmpVal1 = convert_int4_rte(vari * outputScale1 + output_ZP1); \\\n\ + VXC_DP2x8(src0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + int2 coord_out = (int2)(gidx, gidy); \\\n\ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, src0.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +MOMENTS_AXIS2_QINT_U8(U8, vxc_uchar16)\n\ +"; /* end of moments_u8_vx*/ + +static const char moments_u8_axis012_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +_viv_uniform float dimRatio;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform float4 output_ZP;\n\ +_viv_uniform float4 outputScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +#define MOMENTS_AXIS012_QINT_U8(src0_type_name, read0_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toU8( \\\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int4 coord = (int4)(gidx, 0, 0, 0); \\\n\ + read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + for(coord.x = gidx; coord.x < width; coord.x += 256) \\\n\ + { \\\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + tmpSum += (tmpSum1); \\\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \\\n\ + } \\\n\ + sqr += (tmpSqr * e2InScale + rowSumScale); \\\n\ + sum += (tmpSum + sumInZp) * input_scale; \\\n\ + } \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + lcl_sqr[lidx] = sqr; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + int2 coord_out = (int2)(0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + sum = (0); sqr = (0); \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 meanVari; \\\n\ + meanVari.x = sum * dimRatio; \\\n\ + meanVari.y = sqr * dimRatio - meanVari.x * meanVari.x; \\\n\ + vxc_int4 tmpData = convert_int4_rte(meanVari * outputScale + output_ZP); \\\n\ + VXC_DP2x8(src0, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output_mean, coord_out, src0.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ 
+MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16)"; /* end of moments_u8_axis012_vx*/ + static const char one_hot_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataConvert_0_4x4;\n\ @@ -31043,15 +31917,6 @@ _viv_uniform int iter;\n\ _viv_uniform int stride;\n\ _viv_uniform float re_rand_max;\n\ \n\ -inline uchar* get_image2D_array_ptr(image2d_array_t input)\n\ -{\n\ - int8 desc;\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ - uchar *src_ptr = (uchar*)desc.s0;\n\ -\n\ - return src_ptr;\n\ -}\n\ -\n\ uint4 _philox4x32bumpkey(uint4 key)\n\ {\n\ uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0);\n\ @@ -31107,15 +31972,16 @@ uint4 philox4x32_R_10(uint4 ctr, uint4 key)\n\ }\n\ \n\ __kernel void random_seed(\n\ - __read_only image2d_array_t seeds,\n\ - __write_only image2d_array_t output)\n\ + __read_only image2d_t seeds,\n\ + __write_only image2d_t output)\n\ {\n\ int gidx = get_global_id(0);\n\ int gidy = get_global_id(1);\n\ int4 coord = (int4)(gidx << 1, gidy, 0, 0);\n\ \n\ int width = get_image_width(seeds);\n\ - __global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds);\n\ + Image s_img = create_image_from_image2d(seeds, 4);\n\ + __global uint* seeds_ptr = (__global uint*)s_img.ptr;\n\ seeds_ptr = seeds_ptr + coord.x + coord.y * width;\n\ uint4 key = vload4(0, seeds_ptr);\n\ \n\ @@ -31123,8 +31989,9 @@ __kernel void random_seed(\n\ float4 result = 0;\n\ \n\ width = get_image_width(output);\n\ + Image o_img = create_image_from_image2d(output, 4);\n\ coord.x = gidx * stride + width * coord.y;\n\ - __global float* output_ptr = (__global float*)get_image2D_array_ptr(output);\n\ + __global float* output_ptr = (__global float*)o_img.ptr;\n\ output_ptr += coord.x;\n\ \n\ for(int i = 0; i < iter; i++)\n\ @@ -31146,8 +32013,8 @@ float4 eltwise_unary_exp(float4 x)\n\ // x dim = 1\n\ __kernel void random_multinomial_cdf_F16\n\ (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -31162,7 +32029,8 @@ __kernel void random_multinomial_cdf_F16\n\ \n\ int class_max_stride = get_image_width(input);\n\ int offset = gidy * class_max_stride;\n\ - __global float* output_ptr = (__global float*)get_image2D_array_ptr(output);\n\ + Image o_img = create_image_from_image2d(output, 4);\n\ + __global float* output_ptr = (__global float*)o_img.ptr;\n\ __global float* cdfPtr = output_ptr + offset;\n\ \n\ VXC_ReadImage(maxData, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ @@ -31205,8 +32073,8 @@ __kernel void random_multinomial_cdf_F16\n\ \n\ __kernel void random_multinomial_cdf_F32\n\ (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -31222,11 +32090,13 @@ __kernel void random_multinomial_cdf_F32\n\ int class_max_stride = get_image_width(input);\n\ float tmp = 0;\n\ int offset = gidy * class_max_stride;\n\ - __global float* output_ptr = (__global float*)get_image2D_array_ptr(output);\n\ + Image o_img = create_image_from_image2d(output, 4);\n\ + __global float* output_ptr = (__global float*)o_img.ptr;\n\ __global float* cdfPtr = output_ptr + offset;\n\ \n\ int width = get_image_width(input);\n\ - __global float* input_ptr = (__global float*)get_image2D_array_ptr(input);\n\ + Image i_img = create_image_from_image2d(input, 4);\n\ + __global float* input_ptr = (__global 
float*)i_img.ptr;\n\ input_ptr = input_ptr + coord.x + coord.y * width;\n\ \n\ float4 maxVal = vload4(0, input_ptr);\n\ @@ -31270,10 +32140,10 @@ uint upper_bound(float* a, int n, float x)\n\ // one thread calculate 4\n\ __kernel void random_multinomial\n\ (\n\ - __read_only image2d_array_t randoms,\n\ - __read_only image2d_array_t cdfs,\n\ - __write_only image2d_array_t output,\n\ - int class_size\n\ + __read_only image2d_t randoms,\n\ + __read_only image2d_t cdfs,\n\ + __write_only image2d_t output,\n\ + int class_size\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -31282,17 +32152,20 @@ __kernel void random_multinomial\n\ \n\ int class_max_stride = get_image_width(cdfs);\n\ int offset = gidy * class_max_stride;\n\ - __global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs);\n\ + Image cdf_img = create_image_from_image2d(cdfs, 4);\n\ + __global float* cdf_ptr = (__global float*)cdf_img.ptr;\n\ __global float* cdfPtr = cdf_ptr + offset;\n\ \n\ int width = get_image_width(randoms);\n\ offset = coord.x + coord.y * width;\n\ - __global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms);\n\ + Image r_img = create_image_from_image2d(randoms, 4);\n\ + __global float* randoms_ptr = (__global float*)r_img.ptr;\n\ randoms_ptr = randoms_ptr + offset;\n\ \n\ width = get_image_width(output);\n\ offset = coord.x + coord.y * width;\n\ - __global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output);\n\ + Image o_img = create_image_from_image2d(output, 4);\n\ + __global uint* output_ptr = (__global uint*)o_img.ptr;\n\ output_ptr = output_ptr + offset;\n\ \n\ float4 ran = vload4(0, randoms_ptr);\n\ @@ -33524,7 +34397,7 @@ __kernel void repeat_I16_axis0(\n\ image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ image2d_array_t output, int axis)\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ vxc_short8 src0;\n\ VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -33541,6 +34414,7 @@ __kernel void repeat_I16_axis0(\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord.z, baseAddr);\n\ int end = len + start;\n\ + coord.w = get_global_id(2);\n\ \n\ for(coord.y = start; coord.y < end; coord.y++)\n\ {\n\ @@ -33650,6 +34524,7 @@ __kernel void repeat_U8_axis0(\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord.z, baseAddr);\n\ int end = len + start;\n\ + coord.w = get_global_id(2);\n\ \n\ for(coord.y = start; coord.y < end; coord.y++)\n\ {\n\ @@ -33708,7 +34583,7 @@ __kernel void repeat_I16_axis1(\n\ image2d_array_t output, int axis)\n\ {\n\ int gidy = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0);\n\ + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), get_global_id(2));\n\ vxc_short8 src0, src1, src2, src3, src4, src5, src6, src7;\n\ \n\ int8 input_desc, output_desc;\n\ @@ -33813,7 +34688,7 @@ __kernel void repeat_U8_axis1(\n\ image2d_array_t output, int axis)\n\ {\n\ int gidy = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0);\n\ + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), get_global_id(2));\n\ vxc_uchar16 src0, src1, src2, src3, src4, src5, src6, src7;\n\ \n\ int8 input_desc, output_desc;\n\ @@ -33957,16 +34832,16 @@ __kernel void 
resize_1d_bilinear_BF16toBF16_DOWN\n\ \n\ do\n\ {\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 src;\n\ @@ -33983,7 +34858,7 @@ __kernel void resize_1d_bilinear_BF16toBF16_DOWN\n\ vxc_ushort8 tmp, dst;\n\ _viv_asm(COPY, tmp, dst4, 16);\n\ dst.s0123 = tmp.s1357;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y++;\n\ coord_out.y ++;\n\ @@ -34015,9 +34890,9 @@ __kernel void resize_1d_bilinear_BF16toBF16_UP\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -34038,9 +34913,9 @@ __kernel void resize_1d_bilinear_BF16toBF16_UP\n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ coord_in.y ++;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 dst_tmp;\n\ @@ -34058,7 +34933,7 @@ __kernel void resize_1d_bilinear_BF16toBF16_UP\n\ _viv_asm(COPY, tmp, dst4, 16);\n\ dst.s0123 = tmp.s1357;\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ } while (coord_out.y < out_height);\n\ @@ -34086,12 +34961,12 @@ _viv_uniform int out_height;\n\ \\\n\ while (coord_out.y < out_height) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, read_data, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, in0, read_data, 16); \\\n\ VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \\\n\ _viv_asm(COPY, save_data, 
result, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, save_data, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_in.y++; \\\n\ coord_out.y++; \\\n\ @@ -34150,15 +35025,15 @@ RESIZE_1D_2X_DOWN_8BIT_SAME(I8, I8, vxc_char16, vxc_char16)\n\ \\\n\ while (coord_out.y < out_height) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, read_data, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, in0, read_data, 16); \\\n\ - VXC_OP4(img_load_3d, read_data1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_OP4(img_load_3d, read_data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, in1, read_data1, 16); \\\n\ VXC_DP2x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxDown_2x8); \\\n\ _viv_asm(COPY, save_data, result, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, save_data, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_in.y++; \\\n\ coord_out.y++; \\\n\ @@ -34253,16 +35128,16 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN\n\ \n\ do\n\ {\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ _viv_asm(COPY, src_half, src, 16);\n\ @@ -34276,7 +35151,7 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN\n\ _viv_asm(CONV, tmp, dst4);\n\ VXC_DP2x8(dst, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ _viv_asm(COPY, result, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y++;\n\ coord_out.y ++;\n\ @@ -34318,16 +35193,16 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN\n\ \n\ do\n\ {\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, 
VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ _viv_asm(COPY, src_half, src, 16);\n\ @@ -34341,7 +35216,7 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y++;\n\ coord_out.y ++;\n\ @@ -34373,9 +35248,9 @@ __kernel void resize_1d_bilinear_F16toF16_UP\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -34397,9 +35272,9 @@ __kernel void resize_1d_bilinear_F16toF16_UP\n\ _viv_asm(COPY, top, dst0, 16);\n\ \n\ coord_in.y ++;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ @@ -34414,7 +35289,7 @@ __kernel void resize_1d_bilinear_F16toF16_UP\n\ VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ _viv_asm(COPY, dst0, top, 16);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ } while (coord_out.y < out_height);\n\ @@ -34463,9 +35338,9 @@ __kernel void resize_1d_bilinear_I16toI16_UP\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -34490,9 +35365,9 @@ __kernel void resize_1d_bilinear_I16toI16_UP\n\ float4 right4;\n\ \n\ coord_in.y ++;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, 
VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ @@ -34502,7 +35377,7 @@ __kernel void resize_1d_bilinear_I16toI16_UP\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y ++;\n\ } while (coord_out.y < out_height);\n\ @@ -34540,16 +35415,16 @@ __kernel void resize_1d_bilinear_I16toI16_DOWN\n\ \n\ do\n\ {\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4);\n\ @@ -34561,7 +35436,7 @@ __kernel void resize_1d_bilinear_I16toI16_DOWN\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y++;\n\ coord_out.y ++;\n\ @@ -34613,7 +35488,7 @@ __kernel void resize_1d_bilinear_I8toI8_UP\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -34635,7 +35510,7 @@ __kernel void resize_1d_bilinear_I8toI8_UP\n\ _viv_asm(COPY, top, dst0, 16);\n\ \n\ coord_in.y++;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ @@ -34651,7 +35526,7 @@ __kernel void resize_1d_bilinear_I8toI8_UP\n\ dst4 = dst4 * dfpScale;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ coord_out.y ++;\n\ @@ -34690,16 +35565,16 @@ __kernel void resize_1d_bilinear_I8toI8_DOWN\n\ \n\ do\n\ {\n\ - VXC_OP4(img_load_3d, src, input, 
coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4);\n\ @@ -34711,7 +35586,7 @@ __kernel void resize_1d_bilinear_I8toI8_DOWN\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y++;\n\ coord_out.y ++;\n\ @@ -34770,16 +35645,16 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN\n\ \n\ do\n\ {\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ @@ -34791,7 +35666,7 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN\n\ _viv_asm(CONV, dst, dst4);\n\ vxc_short8 dst_short;\n\ _viv_asm(COPY, dst_short, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_short,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y++;\n\ coord_out.y ++;\n\ @@ -34828,7 +35703,7 @@ __kernel void resize_1d_bilinear_U8toU8_UP\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -34848,7 +35723,7 @@ __kernel void resize_1d_bilinear_U8toU8_UP\n\ {\n\ VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0));\n\ coord_in.y++;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww,\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz,\n\ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ @@ -34863,7 +35738,7 @@ __kernel void resize_1d_bilinear_U8toU8_UP\n\ dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ coord_out.y ++;\n\ @@ -34904,16 +35779,16 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN\n\ \n\ do\n\ {\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ @@ -34925,7 +35800,7 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN\n\ \n\ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y++;\n\ coord_out.y ++;\n\ @@ -34971,7 +35846,7 @@ __kernel void resize_1d_bilinear_U8toU8_UP_opt\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -34999,14 +35874,14 @@ __kernel void resize_1d_bilinear_U8toU8_UP_opt\n\ VXC_BitExtract(src_mask, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ coord_in.y++;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww,\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz,\n\ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_uchar16 dst;\n\ VXC_DP4x4(dst, src_mask, lerp,\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ coord_out.y ++;\n\ @@ -35035,12 +35910,12 @@ _viv_uniform int out_height;\n\ \\\n\ while (coord_out.y < out_height) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, read_data, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), 
\\\n\ + VXC_OP4(img_load_3d, read_data, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, in0, read_data, 16); \\\n\ VXC_DP2x8(result, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResizeNxUp_2x8); \\\n\ _viv_asm(COPY, save_data, result, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, save_data, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, save_data, \\\n\ VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_in.y++; \\\n\ coord_out.y++; \\\n\ @@ -35209,19 +36084,19 @@ __kernel void resize_1d_nearest_F16toF16\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35249,9 +36124,9 @@ __kernel void resize_1d_nearest_F16toF16_op\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ @@ -35261,7 +36136,7 @@ __kernel void resize_1d_nearest_F16toF16_op\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35289,19 +36164,19 @@ __kernel void resize_1d_nearest_I8toI8\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(1, 1, 0, 
VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35329,7 +36204,7 @@ __kernel void resize_1d_nearest_I8toI8_op\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ @@ -35340,7 +36215,7 @@ __kernel void resize_1d_nearest_I8toI8_op\n\ VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, dst, dst0, 8);\n\ VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35367,16 +36242,16 @@ __kernel void resize_1d_nearest_U8toU8\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 multiplier;\n\ @@ -35384,7 +36259,7 @@ __kernel void resize_1d_nearest_U8toU8\n\ VXC_DP2x8(src, src, multiplier, \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35411,7 +36286,7 @@ __kernel void resize_1d_nearest_U8toU8_op\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ @@ -35423,7 +36298,7 @@ __kernel void resize_1d_nearest_U8toU8_op\n\ _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35450,21 +36325,21 @@ __kernel void resize_1d_nearest_I16toI16\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35492,9 +36367,9 @@ __kernel void resize_1d_nearest_I16toI16_op\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ @@ -35505,7 +36380,7 @@ __kernel void resize_1d_nearest_I16toI16_op\n\ _viv_asm(COPY, dst, dst0, 8);\n\ VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_1d_nearest_vx*/ @@ -35549,24 +36424,24 @@ __kernel void resize_bilinear_BF16toBF16_DOWN\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, 
VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 src;\n\ @@ -35599,7 +36474,7 @@ __kernel void resize_bilinear_BF16toBF16_DOWN\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35632,13 +36507,13 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ \n\ @@ -35667,19 +36542,17 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ \n\ - coord_in.w += input_desc.s4;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + coord_in.zw += (int2)(1, input_desc.s4);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ \n\ vxc_ushort8 dst_tmp;\n\ -\n\ \n\ VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ _viv_asm(COPY, left4, dst_tmp, 16);\n\ @@ -35702,8 +36575,8 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ _viv_asm(COPY, tmp, dst4, 16);\n\ dst.s0123 = tmp.s1357;\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.w += output_desc.s4;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.zw += (int2)(1, output_desc.s4);\n\ }\n\ \n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -35726,7 +36599,7 @@ __kernel void resize_bilinear_BF16toBF16_UP\n\ vxc_ushort8 tmp, dst;\n\ _viv_asm(COPY, tmp, dst4, 16);\n\ dst.s0123 = tmp.s1357;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_BF16_vx*/ @@ -35771,24 +36644,24 @@ __kernel void resize_bilinear_F16toF16_DOWN\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + 
VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, top_short, 16);\n\ _viv_asm(COPY, bottom, bottom_short, 16);\n\ @@ -35817,7 +36690,7 @@ __kernel void resize_bilinear_F16toF16_DOWN\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35848,24 +36721,24 @@ __kernel void resize_bilinear_F16toU8_DOWN\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, top_short, 16);\n\ _viv_asm(COPY, bottom, bottom_short, 16);\n\ @@ -35892,7 +36765,7 @@ __kernel void resize_bilinear_F16toU8_DOWN\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_uchar,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -35928,13 +36801,13 @@ __kernel void resize_bilinear_F16toF16_UP\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, 
coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -35964,16 +36837,15 @@ __kernel void resize_bilinear_F16toF16_UP\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ \n\ \n\ - coord_in.w += input_desc.s4;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + coord_in.zw += (int2)(1, input_desc.s4);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ \n\ VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ @@ -35988,8 +36860,8 @@ __kernel void resize_bilinear_F16toF16_UP\n\ VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ _viv_asm(COPY, dst0, top, 16);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.w += output_desc.s4;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.zw += (int2)(1, output_desc.s4);\n\ }\n\ \n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -36010,7 +36882,7 @@ __kernel void resize_bilinear_F16toF16_UP\n\ VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ _viv_asm(COPY, dst0, top, 16);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_F16_vx*/ @@ -36063,13 +36935,13 @@ __kernel void resize_bilinear_I16toI16_UP\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, 
src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -36097,16 +36969,15 @@ __kernel void resize_bilinear_I16toI16_UP\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ - coord_in.w += input_desc.s4;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + coord_in.zw += (int2)(1, input_desc.s4);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src3, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ \n\ VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ @@ -36124,8 +36995,8 @@ __kernel void resize_bilinear_I16toI16_UP\n\ \n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.w += output_desc.s4;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.zw += (int2)(1, output_desc.s4);\n\ }\n\ \n\ VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -36145,7 +37016,7 @@ __kernel void resize_bilinear_I16toI16_UP\n\ dst4 = dst4 * dfpScale;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ }\n\ \n\ @@ -36177,24 +37048,24 @@ __kernel void resize_bilinear_I16toI16_DOWN\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top, input, 
coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ @@ -36225,7 +37096,7 @@ __kernel void resize_bilinear_I16toI16_DOWN\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -36280,9 +37151,9 @@ __kernel void resize_bilinear_I8toI8_UP\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -36311,12 +37182,11 @@ __kernel void resize_bilinear_I8toI8_UP\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ \n\ - coord_in.w += input_desc.s4;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + coord_in.zw += (int2)(1, input_desc.s4);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ \n\ VXC_DP4x4(left4, top, top, \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ @@ -36337,8 +37207,8 @@ __kernel void resize_bilinear_I8toI8_UP\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.w += output_desc.s4;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.zw += (int2)(1, 
output_desc.s4);\n\ }\n\ \n\ VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -36360,7 +37230,7 @@ __kernel void resize_bilinear_I8toI8_UP\n\ dst4 = dst4 * dfpScale;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_I8toI8_DOWN\n\ @@ -36388,24 +37258,24 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ @@ -36436,7 +37306,7 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_I8_vx*/ @@ -36479,24 +37349,24 @@ __kernel void resize_bilinear_U8toF16_DOWN\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, 
VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ @@ -36529,7 +37399,7 @@ __kernel void resize_bilinear_U8toF16_DOWN\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst_short.s0246,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_short.s0246,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -36570,9 +37440,9 @@ __kernel void resize_bilinear_U8toU8_UP\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -36599,12 +37469,12 @@ __kernel void resize_bilinear_U8toU8_UP\n\ VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.w += input_desc.s4;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + coord_in.zw += (int2)(1, input_desc.s4);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ +\n\ unsigned char inputZP;\n\ _viv_asm(COPY, inputZP, input_ZP, 4);\n\ VXC_DP4x4(left4, top, inputZP, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ @@ -36621,8 +37491,8 @@ __kernel void resize_bilinear_U8toU8_UP\n\ dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.w += output_desc.s4;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.zw += (int2)(1, output_desc.s4);\n\ }\n\ \n\ VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -36643,7 +37513,7 @@ __kernel void resize_bilinear_U8toU8_UP\n\ dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void resize_bilinear_U8toU8_DOWN\n\ @@ -36671,24 +37541,24 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ float4 left4;\n\ @@ -36718,7 +37588,7 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + 
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_U8_vx*/ @@ -36748,9 +37618,9 @@ __kernel void resize_bilinear_U8toU8_SAME_2x_upsample_half_pixel_centers\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ int8 output_desc;\n\ @@ -36762,26 +37632,26 @@ __kernel void resize_bilinear_U8toU8_SAME_2x_upsample_half_pixel_centers\n\ {\n\ VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y += 2;\n\ coord_out.y++;\n\ @@ -36811,9 +37681,9 @@ __kernel void resize_bilinear_U8toU8_SAME_4x_upsample_half_pixel_centers\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + 
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ int8 output_desc;\n\ @@ -36827,44 +37697,44 @@ __kernel void resize_bilinear_U8toU8_SAME_4x_upsample_half_pixel_centers\n\ VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - 
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y += 2;\n\ coord_out.y++;\n\ @@ -36898,13 +37768,13 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, in2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, in3, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ int8 output_desc;\n\ @@ -36916,7 +37786,7 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers\n\ VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ \n\ @@ -36930,13 +37800,13 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers\n\ VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ \n\ @@ -36946,10 +37816,10 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers\n\ VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - 
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_bilinear_U8_half_pixel_centers_vx*/ @@ -36998,9 +37868,9 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 bitextract_p0;\n\ @@ -37031,21 +37901,19 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.w += input_desc.s4;\n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww,\n\ + coord_in.zw += (int2)(1, input_desc.s4);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz,\n\ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww,\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz,\n\ VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_uchar16 dst;\n\ VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom,\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b);\n\ \n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - coord_out.w += output_desc.s4;\n\ -\n\ - coord_in.z ++;\n\ + coord_out.zw += (int2)(1, output_desc.s4);\n\ }\n\ \n\ VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -37053,7 +37921,7 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ vxc_uchar16 dst;\n\ VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom,\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ \n\ }\n\ @@ -37095,16 +37963,16 @@ __kernel void resize_nearest_F16toF16\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + 
VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ \n\ @@ -37112,7 +37980,7 @@ __kernel void resize_nearest_F16toF16\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -37135,9 +38003,9 @@ __kernel void resize_nearest_F16toF16_op\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ @@ -37151,7 +38019,7 @@ __kernel void resize_nearest_F16toF16_op\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -37174,16 +38042,16 @@ __kernel void resize_nearest_I8toI8\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ @@ -37192,7 +38060,7 @@ __kernel void resize_nearest_I8toI8\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -37215,7 +38083,7 @@ __kernel void resize_nearest_I8toI8_op\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, 
coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ @@ -37231,7 +38099,7 @@ __kernel void resize_nearest_I8toI8_op\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -37253,16 +38121,16 @@ __kernel void resize_nearest_U8toU8\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_ushort8 multiplier;\n\ @@ -37274,7 +38142,7 @@ __kernel void resize_nearest_U8toU8\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -37296,7 +38164,7 @@ __kernel void resize_nearest_U8toU8_op\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ @@ -37312,7 +38180,7 @@ __kernel void resize_nearest_U8toU8_op\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -37334,16 +38202,16 @@ __kernel void resize_nearest_I16toI16\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.y;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, 
VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.z;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ coord_in.x = in_x_idx.w;\n\ - VXC_OP4(img_load_3d, src, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ @@ -37353,7 +38221,7 @@ __kernel void resize_nearest_I16toI16\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -37376,9 +38244,9 @@ __kernel void resize_nearest_I16toI16_op\n\ int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ _viv_asm(MOV, coord_in.w, baseAddr);\n\ \n\ - VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ //in_x_idx = in_x_idx - in_x_idx.xxxx;\n\ @@ -37394,7 +38262,7 @@ __kernel void resize_nearest_I16toI16_op\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst,\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst,\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of resize_nearest_vx*/ @@ -37493,15 +38361,6 @@ _viv_uniform int offsetX;\n\ _viv_uniform int offsetY;\n\ _viv_uniform int offsetZ;\n\ \n\ -inline uchar* get_image2D_array_ptr(image2d_t input)\n\ -{\n\ - int8 desc;\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ - uchar *src_ptr = (uchar*)desc.s0;\n\ -\n\ - return src_ptr;\n\ -}\n\ -\n\ __kernel void scatter_nd_F16toF16_big(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ @@ -37517,9 +38376,12 @@ __kernel void scatter_nd_F16toF16_big(\n\ vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ vxc_half8 sum;\n\ _viv_asm(COPY, sum, tmpVal, 16);\n\ - __global int* index_ptr = (__global int*)get_image2D_array_ptr(input0);\n\ - __global short* update_ptr = (__global short*)get_image2D_array_ptr(input1);\n\ - __global short* output_ptr = (__global short*)get_image2D_array_ptr(output);\n\ + Image i0_img = create_image_from_image2d(input0, 4);\n\ + __global int* index_ptr = (__global int*)i0_img.ptr;\n\ + Image i1_img = create_image_from_image2d(input1, 2);\n\ + __global short* update_ptr = (__global short*)i1_img.ptr;\n\ + Image o_img = create_image_from_image2d(output, 2);\n\ + __global short* output_ptr = (__global short*)o_img.ptr;\n\ for(int i = 0; i < index_num; i++)\n\ {\n\ int4 indice = vload4(0, index_ptr + i * coord_dim);\n\ @@ -37553,9 +38415,12 @@ __kernel void scatter_nd_##src0_type_name##to##src0_type_name##_big( \\\n\ 
int firstFlg = 1; \\\n\ \\\n\ data_type sum = (data_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ - __global int* index_ptr = (__global int*)get_image2D_array_ptr(input0); \\\n\ - __global ptr_type* update_ptr = (__global ptr_type*)get_image2D_array_ptr(input1); \\\n\ - __global ptr_type* output_ptr = (__global ptr_type*)get_image2D_array_ptr(output); \\\n\ + Image i0_img = create_image_from_image2d(input0, 2); \\\n\ + __global int* index_ptr = (__global int*)i0_img.ptr; \\\n\ + Image i1_img = create_image_from_image2d(input1, 2); \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)i1_img.ptr; \\\n\ + Image o_img = create_image_from_image2d(output, 2); \\\n\ + __global ptr_type* output_ptr = (__global ptr_type*)o_img.ptr; \\\n\ for(int i = 0; i < index_num; i++) \\\n\ { \\\n\ int4 indice = vload4(0, index_ptr + i * coord_dim); \\\n\ @@ -37588,6 +38453,497 @@ SCATTER_ND_QINT_BIG(I8, vxc_char8, char)\n\ SCATTER_ND_QINT_BIG(I16, vxc_short8, short)\n\ "; /* end of scatter_nd_big_vx*/ +static const char scatter_nd_update_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccumulateSum_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_1_Lo_2x8;\n\ +_viv_uniform int index_num;\n\ +_viv_uniform int offset_idx;\n\ +_viv_uniform int offsetX;\n\ +_viv_uniform int offsetY;\n\ +_viv_uniform int offsetZ;\n\ +_viv_uniform int offsetW;\n\ +_viv_uniform int2 multAndoutZP0;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +__kernel void scatter_nd_update_F16F16toF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __read_only image2d_t input2,\n\ + image2d_array_t output,\n\ + int width,\n\ + int area,\n\ + int vol,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int cnt = 0;\n\ +\n\ + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_half8 sum;\n\ + _viv_asm(COPY, sum, tmpVal, 16);\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + //int4 indice = read_imagei(input1, (int2)(0, i));\n\ + int4 indice = vload4(0, index_ptr + offset_idx);\n\ + index_ptr += coord_dim;\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_half8 src;\n\ + VXC_ReadImage(tmpVal, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + cnt++;\n\ + _viv_asm(COPY, src, tmpVal, 16);\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8);\n\ + }\n\ + }\n\ + _viv_asm(COPY, tmpVal, sum, 16);\n\ + int2 coord = (int2)(gidx, gidy);\n\ + if(cnt == 0)\n\ + {\n\ + VXC_ReadImage(tmpVal, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + VXC_WriteImage(output, coord, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define SCATTER_ND_UPDATE_QINT(src0_type_name, src2_type_name, out_type_name, data_type) \\\n\ +__kernel void scatter_nd_update_##src0_type_name##src2_type_name##to##out_type_name##( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __read_only image2d_t input2, \\\n\ + image2d_array_t output, \\\n\ + int width, \\\n\ + 
int area, \\\n\ + int vol, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int cnt = 0; \\\n\ + \\\n\ + data_type sum = (data_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + for(int i = 0; i < index_num; i++) \\\n\ + { \\\n\ + int4 indice = vload4(0, index_ptr + offset_idx); \\\n\ + index_ptr += coord_dim; \\\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \\\n\ + if(gidy == idx) \\\n\ + { \\\n\ + data_type src; \\\n\ + VXC_ReadImage(src, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + cnt++; \\\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); \\\n\ + } \\\n\ + } \\\n\ + int2 coord = (int2)(gidx, gidy); \\\n\ + vxc_ushort8 ms0; \\\n\ + data_type dst; \\\n\ + if(cnt == 0) \\\n\ + { \\\n\ + VXC_ReadImage(sum, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + _viv_asm(COPY, ms0, multAndoutZP1, 16); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_1_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_QINT(U8, U8, U8, vxc_uchar8)\n\ +SCATTER_ND_UPDATE_QINT(I8, I8, I8, vxc_char8)\n\ +SCATTER_ND_UPDATE_QINT(I16, I16, I16, vxc_short8)\n\ +\n\ +#define SCATTER_ND_UPDATE_QINT_TO_F16(src0_type, data_type) \\\n\ +__kernel void scatter_nd_update_##src0_type##src0_type##toF16( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __read_only image2d_t input2, \\\n\ + image2d_array_t output, \\\n\ + int width, \\\n\ + int area, \\\n\ + int vol, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int cnt = 0; \\\n\ + vxc_short8 sum = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + data_type src; \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + for(int i = 0; i < index_num; i++) \\\n\ + { \\\n\ + int4 indice = vload4(0, index_ptr + offset_idx); \\\n\ + index_ptr += coord_dim; \\\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \\\n\ + if(gidy == idx) \\\n\ + { \\\n\ + VXC_ReadImage(src, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + cnt++; \\\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); \\\n\ + } \\\n\ + } \\\n\ + int2 coord = (int2)(gidx, gidy); \\\n\ + vxc_ushort8 ms0; \\\n\ + vxc_half8 tmpDst; \\\n\ + vxc_short8 dst; \\\n\ + if(cnt == 0) \\\n\ + { \\\n\ + VXC_ReadImage(src, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst, tmpDst, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } 
\\\n\ + else \\\n\ + { \\\n\ + _viv_asm(COPY, ms0, multAndoutZP1, 16); \\\n\ + VXC_DP2x8(tmpDst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_1_Lo_2x8); \\\n\ + _viv_asm(COPY, dst, tmpDst, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_QINT_TO_F16(U8, vxc_uchar8)\n\ +SCATTER_ND_UPDATE_QINT_TO_F16(I8, vxc_char8)\n\ +SCATTER_ND_UPDATE_QINT_TO_F16(I16, vxc_short8)\n\ +\n\ +__kernel void scatter_nd_update_BF16BF16toBF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __read_only image2d_t input2,\n\ + image2d_array_t output,\n\ + int width,\n\ + int area,\n\ + int vol,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int cnt = 0;\n\ +\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_ushort8 src0, src1, src2;\n\ + float4 srcA, srcB;\n\ + float4 sum0 = (float4)(0);\n\ + float4 sum1 = sum0;\n\ +\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + //int4 indice = read_imagei(input1, (int2)(0, i));\n\ + int4 indice = vload4(0, index_ptr + offset_idx);\n\ + index_ptr += coord_dim;\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;\n\ + if(gidy == idx)\n\ + {\n\ + VXC_ReadImage(src0, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + cnt++;\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ + sum0 += srcA;\n\ + sum1 += srcB;\n\ + }\n\ + }\n\ + int2 coord = (int2)(gidx, gidy);\n\ + if(cnt == 0)\n\ + {\n\ + VXC_ReadImage(src2, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + else\n\ + {\n\ + _viv_asm(COPY, src0, sum0, 16);\n\ + _viv_asm(COPY, src1, sum1, 16);\n\ + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of scatter_nd_update_vx*/ + +static const char scatter_nd_update_atom_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +_viv_uniform int2 multAndoutZP0;\n\ +\n\ +_viv_uniform int offsetX;\n\ +_viv_uniform int offsetY;\n\ +_viv_uniform int offsetZ;\n\ +_viv_uniform int offsetW;\n\ +_viv_uniform int offset_idx;\n\ +\n\ +_viv_uniform float scaleInOut;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform int count_width;\n\ +\n\ +#define SCATTER_ND_UPDATE_QINT_PRE(src0_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_##src0_type##_pre( \\\n\ + __read_only image2d_t input1, __read_only image2d_t input2, \\\n\ + image2d_t output, image2d_t output_cnt, image2d_t tmp_output, \\\n\ + int width, int area, int vol, int coord_dim ) \\\n\ +{ 
\\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + Image img2 = create_image_from_image2d(input2, element_size); \\\n\ + Image img3 = create_image_from_image2d(output, 4); \\\n\ + Image img4 = create_image_from_image2d(output_cnt, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global int* output_ptr = (__global int*)img3.ptr; \\\n\ + __global int* cnt_ptr = (__global int*)img4.ptr; \\\n\ + data_type src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \\\n\ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + vxc_int4 data; \\\n\ + short zp = input_zp; \\\n\ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + atomic_add(output_ptr + loc, data.x); \\\n\ + if(gidx == 0) \\\n\ + { \\\n\ + atomic_inc(cnt_ptr + idx); \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_QINT_PRE(U8, vxc_uchar8, uchar, 1)\n\ +SCATTER_ND_UPDATE_QINT_PRE(I8, vxc_char8, char, 1)\n\ +SCATTER_ND_UPDATE_QINT_PRE(I16, vxc_short8, short, 2)\n\ +\n\ +// input0 ref\n\ +// input1 sum\n\ +// input2 count\n\ +// input3 update\n\ +#define SCATTER_ND_UPDATE_QINT_TO_F16_BIG(src0_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_##src0_type##src0_type##toF16_big( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __read_only image2d_t input2, \\\n\ + __read_only image2d_t input3, \\\n\ + __read_only image2d_t input4, \\\n\ + image2d_t output, \\\n\ + int width, \\\n\ + int area, \\\n\ + int vol, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + Image img2 = create_image_from_image2d(input2, 4); \\\n\ + Image img3 = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + __global int* cnt_ptr = (__global int*)img2.ptr; \\\n\ + __global short* output_ptr = (__global short*)img3.ptr; \\\n\ + data_type src; \\\n\ + \\\n\ + int cnt = cnt_ptr[gidy]; \\\n\ + int loc = gidy * output_width + gidx; \\\n\ + \\\n\ + vxc_ushort8 ms0; \\\n\ + vxc_half8 tmpDst; \\\n\ + vxc_short8 dst; \\\n\ + if(cnt == 0) \\\n\ + { \\\n\ + Image img0 = create_image_from_image2d(input0, element_size); \\\n\ + __global ptr_type* ref_ptr = (__global ptr_type*)img0.ptr; \\\n\ + ptr_type tmpData = ref_ptr[loc]; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst, tmpDst, 16); \\\n\ + output_ptr[loc] = dst.x; \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + __global int* sum_ptr = (__global int*)img1.ptr; \\\n\ + int sum = sum_ptr[loc]; \\\n\ + float result = sum * input_scale; \\\n\ + half tmpOut; \\\n\ + _viv_asm(CONV, tmpOut, result); \\\n\ + _viv_asm(COPY, dst, tmpOut, 4); \\\n\ + output_ptr[loc] = dst.x; \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_QINT_TO_F16_BIG(U8, vxc_uchar8, uchar, 1)\n\ +SCATTER_ND_UPDATE_QINT_TO_F16_BIG(I8, vxc_char8, char, 1)\n\ +SCATTER_ND_UPDATE_QINT_TO_F16_BIG(I16, vxc_short8, short, 
2)\n\ +\n\ +#define SCATTER_ND_UPDATE_QINT_BIG(src0_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_##src0_type##src0_type##to##src0_type##_big( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __read_only image2d_t input2, \\\n\ + __read_only image2d_t input3, \\\n\ + __read_only image2d_t input4, \\\n\ + image2d_t output, \\\n\ + int width, \\\n\ + int area, \\\n\ + int vol, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img2 = create_image_from_image2d(input2, 4); \\\n\ + Image img3 = create_image_from_image2d(output, element_size); \\\n\ + __global int* cnt_ptr = (__global int*)img2.ptr; \\\n\ + __global ptr_type* output_ptr = (__global ptr_type*)img3.ptr; \\\n\ + int cnt = cnt_ptr[gidy]; \\\n\ + int loc = gidy * output_width + gidx; \\\n\ + data_type src, dst; \\\n\ + if(cnt == 0) \\\n\ + { \\\n\ + Image img0 = create_image_from_image2d(input0, element_size); \\\n\ + __global ptr_type* ref_ptr = (__global ptr_type*)img0.ptr; \\\n\ + ptr_type tmpData = ref_ptr[loc]; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + output_ptr[loc] = dst.x; \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + __global int* sum_ptr = (__global int*)img1.ptr; \\\n\ + int sum = sum_ptr[loc]; \\\n\ + int4 result; \\\n\ + result.x = convert_int_rte(sum * scaleInOut + output_zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + output_ptr[loc] = dst.x; \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_QINT_BIG(U8, vxc_uchar8, uchar, 1)\n\ +SCATTER_ND_UPDATE_QINT_BIG(I8, vxc_char8, char, 1)\n\ +SCATTER_ND_UPDATE_QINT_BIG(I16, vxc_short8, short, 2)\n\ +\n\ +__kernel void scatter_nd_update_reset(\n\ + __read_only image2d_t input0,\n\ + image2d_t output_sum,\n\ + image2d_t output_cnt\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ +\n\ + Image img3 = create_image_from_image2d(output_sum, 4);\n\ + Image img4 = create_image_from_image2d(output_cnt, 4);\n\ + __global int* sum_ptr = (__global int*)img3.ptr;\n\ + __global int* cnt_ptr = (__global int*)img4.ptr;\n\ + int4 data = (int4)(0);\n\ + vstore4(data, gidx, sum_ptr);\n\ + if(gidx < count_width)\n\ + {\n\ + vstore4(data, gidx, cnt_ptr);\n\ + }\n\ +}"; /* end of scatter_nd_update_atom_vx*/ + +static const char scatter_nd_update_big_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccumulateSum_2x8;\n\ +_viv_uniform int index_num;\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +\n\ +_viv_uniform int offsetX;\n\ +_viv_uniform int offsetY;\n\ +_viv_uniform int offsetZ;\n\ +_viv_uniform int offsetW;\n\ +_viv_uniform int offset_idx;\n\ +\n\ +__kernel void scatter_nd_update_F16F16toF16_big(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __read_only image2d_t input2,\n\ + image2d_t output,\n\ + int width,\n\ + int area,\n\ + int vol,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int cnt = 0;\n\ +\n\ + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_half8 sum;\n\ + _viv_asm(COPY, sum, tmpVal, 16);\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 
= create_image_from_image2d(input2, 2);\n\ + Image img3 = create_image_from_image2d(output, 2);\n\ +\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global short* update_ptr = (__global short*)img2.ptr;\n\ + __global short* output_ptr = (__global short*)img3.ptr;\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = vload4(0, index_ptr + offset_idx);\n\ + index_ptr += coord_dim;\n\ +\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_half8 src;\n\ + short tmpData = update_ptr[i * update_width + gidx];\n\ + cnt++;\n\ + _viv_asm(COPY, src, tmpData, 4);\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8);\n\ + }\n\ + }\n\ + short dst;\n\ + _viv_asm(COPY, dst, sum, 4);\n\ + int loc = gidy * output_width+ gidx;\n\ + if(cnt == 0)\n\ + {\n\ + Image img0 = create_image_from_image2d(input0, 2);\n\ + __global short* ref_ptr = (__global short*)img0.ptr;\n\ + dst = ref_ptr[loc];\n\ + }\n\ + output_ptr[loc] = dst;\n\ +}\n\ +"; /* end of scatter_nd_update_big_vx*/ + static const char select_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvConditiontoDst_2x8;\n\ @@ -37874,6 +39230,53 @@ __kernel void sequence_mask_F16toU8(\n\ \n\ "; /* end of sequence_mask_vx*/ +static const char signal_frame_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define SIGNAL_FRAME_8BITS_SH_IMPL(type) \\\n\ +__kernel void signal_frame_##type##to##type \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int frame_step \\\n\ + ) \\\n\ +{ \\\n\ + int inner = get_global_id(0); \\\n\ + int length_k = get_global_id(1); \\\n\ + int frames_id = get_global_id(2); \\\n\ + \\\n\ + int4 coord = (int4)(inner, length_k, frames_id, frames_id); \\\n\ + int2 coord_in = (int2)(inner, frames_id * frame_step + length_k); \\\n\ + \\\n\ + vxc_uchar16 src; \\\n\ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SIGNAL_FRAME_8BITS_SH_IMPL(U8)\n\ +SIGNAL_FRAME_8BITS_SH_IMPL(I8)\n\ +\n\ +#define SIGNAL_FRAME_16BITS_SH_IMPL(type) \\\n\ +__kernel void signal_frame_##type##to##type \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int frame_step \\\n\ + ) \\\n\ +{ \\\n\ + int inner = get_global_id(0); \\\n\ + int length_k = get_global_id(1); \\\n\ + int frames_id = get_global_id(2); \\\n\ + \\\n\ + int4 coord = (int4)(inner, length_k, frames_id, frames_id); \\\n\ + int2 coord_in = (int2)(inner, frames_id * frame_step + length_k); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SIGNAL_FRAME_16BITS_SH_IMPL(I16)\n\ +SIGNAL_FRAME_16BITS_SH_IMPL(F16)\n\ +SIGNAL_FRAME_16BITS_SH_IMPL(BF16)"; /* end of signal_frame_vx*/ + static const char slice_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define SLICE_SAMLEFL_SH_IMPL(name, data_type, end_bin) \\\n\ @@ -38395,6 +39798,71 @@ __kernel void swish_BF16toBF16_2D(\n\ }\n\ "; /* end of swish_vx*/ +static const char tensorstackconcat_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void tensorstackconcat_16bits\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t index,\n\ + 
__write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.w = 0;\n\ + coord.y = read_imagei(index, coord.ww).x;\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void tensorstackconcat_8bits\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t index,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int idx = coord.x;\n\ + vxc_char16 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y = read_imagei(index, coord.ww).x;\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void tensorstackconcat_16bits_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t index,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y = read_imagei(index, coord.ww).x;\n\ + VXC_WriteImage(output, coord.xy, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void tensorstackconcat_8bits_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t index,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int idx = coord.x;\n\ + vxc_char16 src0, src1;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y = read_imagei(index, coord.ww).x;\n\ + VXC_WriteImage(output, coord.xy, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of tensorstackconcat_vx*/ + static const char tile_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int lastWorkItem;\n\ @@ -39816,7 +41284,7 @@ _viv_uniform float tail;\n\ coord_out.x = coord.x; \\\n\ for (int x = 0; x < stride; ) \\\n\ { \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, write_val, \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, write_val, \\\n\ VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); \\\n\ x++; \\\n\ coord_out.x ++; \\\n\ @@ -39870,9 +41338,9 @@ _viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_lo_2x8); \\\n\ VXC_DP2x8(dst_val, src_val, multiplier, \\\n\ VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniUpScale2X_hi_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ coord.y ++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst_val, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ \n\ UPSAMPLE_SCALETO8B_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char16)\n\ @@ -39906,14 +41374,14 @@ 
UPSAMPLE_SCALETO8B_FUN(U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16)\n\ VXC_DP2x8(dst0_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_lo_2x8); \\\n\ VXC_DP2x8(dst1_val, src_val, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniUpScale2X_hi_2x8); \\\n\ _viv_asm(COPY, write_val, dst0_val, 16); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ coord.y ++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, write_val, dst1_val, 16); \\\n\ coord.xy = coord.xy + (int2)(8, -1); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ coord.y ++; \\\n\ - VXC_OP4_NoDest(img_store_3d, output, coord.xyww, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, write_val, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ UPSAMPLE_SCALETO16B_FUN(F16, F16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ UPSAMPLE_SCALETO16B_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ @@ -39923,71 +41391,6 @@ UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_sho UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ "; /* end of upsamplescale_k2_vx*/ -static const char vsi_nn_kernel_box_with_nms_limit_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcBox_with_nms_limit(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ -\n\ -}\n\ -"; /* end of vsi_nn_kernel_box_with_nms_limit_vx*/ - -static const char vsi_nn_kernel_detection_postprocess_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcDetection_postprocess(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ -\n\ -}\n\ -"; /* end of vsi_nn_kernel_detection_postprocess_vx*/ - -static const char vsi_nn_kernel_extra_ending_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcExtra_ending_i16(\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 data;\n\ - VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void vxcExtra_ending_i8(\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_char8 data;\n\ - VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void vxcExtra_ending_u8(\n\ - __read_only image2d_array_t input0,\n\ 
- __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar8 data;\n\ - VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_extra_ending_vx*/ - static const char vsi_nn_kernel_header_vx[] = "/*\n\ ============================================================================\n\ Name : libNNExt.vx\n\ @@ -40015,12 +41418,20 @@ inline Image create_image_from_image2d(image2d_t input, int stride_x)\n\ {\n\ int8 desc;\n\ _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ +#if (USE_40BITS_VA==0)\n\ + uint address = as_uint(desc.s0);\n\ + int stride_y = desc.s1;\n\ +#else\n\ + ulong address = as_ulong(desc.s05);\n\ + int stride_y = desc.s6;\n\ +#endif\n\ \n\ Image img =\n\ {\n\ - .ptr = (uchar*)desc.s0,\n\ + .ptr = (uchar*)address,\n\ .stride_x = stride_x,\n\ - .stride_y = desc.s1\n\ + .stride_y = stride_y\n\ };\n\ \n\ return img;\n\ @@ -40034,22 +41445,35 @@ typedef struct Tensor\n\ int stride_z;\n\ } Tensor;\n\ \n\ -inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ +inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ {\n\ return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z;\n\ }\n\ \n\ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)\n\ {\n\ +#if (USE_40BITS_VA==0)\n\ int8 desc;\n\ _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + uint address = as_uint(desc.s0);\n\ + int stride_y = desc.s1;\n\ + int stride_z = desc.s4;\n\ +#else\n\ + int16 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + ulong address = as_ulong(desc.s05);\n\ + int stride_y = desc.s6;\n\ + int stride_z = desc.sa;\n\ +#endif\n\ \n\ Tensor t =\n\ {\n\ - .ptr = (uchar*)desc.s0,\n\ + .ptr = (uchar*)address,\n\ .stride_x = stride_x,\n\ - .stride_y = desc.s1,\n\ - .stride_z = desc.s4\n\ + .stride_y = stride_y,\n\ + .stride_z = stride_z\n\ };\n\ \n\ return t;\n\ @@ -40244,1984 +41668,96 @@ do\\\n\ #endif\n\ "; /* end of vsi_nn_kernel_header_vx*/ -static const char vsi_nn_kernel_heatmap_max_keypoint_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcHeatmap_max_keypoint(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ +static const char warp_affine_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ +_viv_uniform VXC_512Bits uniConvertDatatoF32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertDatatoF32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +#define WARP_AFFINE_SH_IMPL(name0, name1, src_type, src_copy_type, convert_type, dst_type, dst_copy_type) \\\n\ +__kernel void warp_affine_##name0##to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t matrix, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); \\\n\ + \\\n\ + float4 coord_f = convert_float4(coord_in); \\\n\ + \\\n\ + int2 m_coord = (int2)(0, 0); \\\n\ + vxc_ushort8 m0, m1; \\\n\ + 
VXC_ReadImage(m0, matrix, m_coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(m1, matrix, m_coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 matrix0, matrix1; \\\n\ + _viv_asm(COPY, matrix0, m0, 16); \\\n\ + _viv_asm(COPY, matrix1, m1, 16); \\\n\ + \\\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; \\\n\ + \\\n\ + coord_in = convert_int4(coord_f < 0 ? coord_f - 2 : coord_f); \\\n\ + \\\n\ + int4 coord_in0 = (int4)(coord_in.xy, coord.zw); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in0.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in0.z, baseAddr); \\\n\ + \\\n\ + src_type v0, v1; \\\n\ + src_copy_type top, bot; \\\n\ + VXC_OP4(img_load_3d, v0, input, coord_in0, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, v1, input, coord_in0, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in0.xy = coord_in.zw; \\\n\ + VXC_OP4(img_load_3d, v0, input, coord_in0, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, v1, input, coord_in0, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, v0, 16); \\\n\ + _viv_asm(COPY, bot, v1, 16); \\\n\ + \\\n\ + float4 lerp = coord_f - floor(coord_f); \\\n\ + float4 minus_lerp = 1.0f - lerp; \\\n\ + float4 coef0 = (float4)( minus_lerp.x * minus_lerp.y, lerp.x * minus_lerp.y, \\\n\ + minus_lerp.x * lerp.y, lerp.x * lerp.y); \\\n\ + \\\n\ + float4 coef1 = (float4)( minus_lerp.z * minus_lerp.w, lerp.z * minus_lerp.w, \\\n\ + minus_lerp.z * lerp.w, lerp.z * lerp.w); \\\n\ + \\\n\ + float4 data0, data1, result = 0; \\\n\ + VXC_DP4x4(data0, top, bot, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(data1, top, bot, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDatatoF32_1_4x4); \\\n\ + \\\n\ + data0 = data0 * input_scale + input_tail; \\\n\ + data1 = data1 * input_scale + input_tail; \\\n\ + result.x = dot(data0, coef0); \\\n\ + result.y = dot(data1, coef1); \\\n\ + result.xy = result.xy * output_scale + output_zp; \\\n\ + convert_type dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 1, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + dst_copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 1, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ -"; /* end of vsi_nn_kernel_heatmap_max_keypoint_vx*/ - -static const char vsi_nn_kernel_imageprocess_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniVecShift10;\n\ -_viv_uniform VXC_512Bits uniAddRShift;\n\ -_viv_uniform VXC_512Bits uniGetTempVal;\n\ -_viv_uniform VXC_512Bits uniExtractBytes;\n\ -_viv_uniform VXC_512Bits uniUnpackToR;\n\ -_viv_uniform VXC_512Bits uniUnpackToG;\n\ -_viv_uniform VXC_512Bits uniUnpackToB;\n\ -_viv_uniform VXC_512Bits uniDataMulAlpha_4x4;\n\ -_viv_uniform VXC_512Bits uniDataSubMean_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform VXC_512Bits uniExtactInteger_2x8;\n\ -\n\ -#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ -__kernel void ScaletoTensor_Int8\n\ - (\n\ - __read_only image2d_t 
input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ -\n\ - int4 xPos = get_global_id(0);\n\ - int yPos = get_global_id(1);\n\ -\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ - xPos += (int4)(0, 1, 2, 3);\n\ -\n\ - //x\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ - int4 sx = fx0 & 0xffff8000;\n\ - fx0 -= sx;\n\ - sx = sx >> 15;\n\ -\n\ - vxc_short4 fx;\n\ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ - //y\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ - int sy = fy & 0xffff8000; // Floor\n\ -\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ -\n\ - //R\n\ - vxc_uchar16 line0RGB1, line0RGB2;\n\ - vxc_uchar16 line1RGB3, line1RGB4;\n\ - int4 coord;\n\ - sx = sx * 3 + *xOffset;\n\ - coord.xyz = sx.xyz;\n\ - coord.w = sy + *yOffset;\n\ - int2 coord1 = (int2)(sx.w, coord.w);\n\ - VXC_ReadImage(line0RGB1, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB1, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB2, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB2, input, coord1, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1),\\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0);\n\ -\n\ - bgrMean *= f32Var;\n\ -\n\ - int4 test01, temp1;\n\ - int4 test02, temp2;\n\ - int4 tt;\n\ - vxc_uchar4 val;\n\ - int4 coord_out = (int4)(xPos.x, yPos, 2, 0);\n\ -\n\ - vxc_uchar8 line1, line2;\n\ -\n\ - //R\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ -\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - vxc_float4 tmp_dst;\n\ - vxc_uchar4 u8_dst;\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - //convert U8 to dfp8\n\ - int4 dst0;\n\ - vxc_char4 dst;\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.z;\n\ - tmp_dst *= outputScale;\n\ - dst0 = 
convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //G\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ -\n\ - coord_out.z = 1;\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.y;\n\ - tmp_dst *= outputScale;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //B\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ -\n\ - coord_out.z = 0;\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.x;\n\ - tmp_dst *= outputScale;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void ScaletoTensor_Fp16\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ -\n\ - int4 xPos = get_global_id(0);\n\ - int yPos = get_global_id(1);\n\ -\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ - xPos += (int4)(0, 1, 2, 3);\n\ -\n\ - //x\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ - int4 sx = fx0 & 0xffff8000;\n\ - fx0 -= sx;\n\ - sx = sx >> 15;\n\ -\n\ - vxc_short4 fx;\n\ - VXC_DP4x4(fx, fx0, 1 << 4, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ - //y\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ - int sy = fy & 0xffff8000; // Floor\n\ -\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ -\n\ - //R\n\ - vxc_uchar16 line0RGB1, line0RGB2;\n\ - vxc_uchar16 line1RGB3, line1RGB4;\n\ - int4 coord;\n\ - sx = sx * 3 + *xOffset;\n\ - coord.xyz = sx.xyz;\n\ - coord.w = sy + *yOffset;\n\ - int2 coord1 = (int2)(sx.w, coord.w);\n\ - VXC_ReadImage(line0RGB1, input, coord.xw,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB1, input, coord.yw,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB2, input, coord.zw,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB2, input, coord1,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(line1RGB3, input, coord.xw,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB3, input, coord.yw,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB4, input, coord.zw,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB4, input, coord1,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0);\n\ -\n\ - int4 test01, temp1;\n\ - int4 test02, temp2;\n\ - int4 tt;\n\ - vxc_uchar4 val;\n\ - int4 coord_out = (int4)(xPos.x, yPos, 2, 0);\n\ -\n\ - vxc_uchar8 line1, line2;\n\ -\n\ - //R\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ -\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ -\n\ - //convert U8 to FP16\n\ - half4 f16mean;\n\ - half f16alpha;\n\ - vxc_half4 dst;\n\ - vxc_short4 tmp_dst;\n\ - _viv_asm(CONV, f16mean, bgrMean);\n\ - _viv_asm(CONV, f16alpha, f32Var);\n\ - VXC_DP4x4(dst, val, f16mean.z, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4);\n\ - VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4);\n\ - _viv_asm(COPY, tmp_dst, dst, 8);\n\ - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //G\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ -\n\ - coord_out.z = 1;\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - 
VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ -\n\ - VXC_DP4x4(dst, val, f16mean.y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4);\n\ - VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4);\n\ - _viv_asm(COPY, tmp_dst, dst, 8);\n\ - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //B\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ -\n\ - coord_out.z = 0;\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ -\n\ - VXC_DP4x4(dst, val, f16mean.x, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4);\n\ - VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4);\n\ - _viv_asm(COPY, tmp_dst, dst, 8);\n\ - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ -}\n\ -\n\ -"; /* end of vsi_nn_kernel_imageprocess_vx*/ - -static const char vsi_nn_kernel_imageprocess_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniVecShift10;\n\ -_viv_uniform VXC_512Bits uniAddRShift;\n\ -_viv_uniform VXC_512Bits uniGetTempVal;\n\ -_viv_uniform VXC_512Bits uniExtractBytes;\n\ -_viv_uniform VXC_512Bits uniUnpackToR;\n\ -_viv_uniform VXC_512Bits uniUnpackToG;\n\ -_viv_uniform VXC_512Bits uniUnpackToB;\n\ -_viv_uniform VXC_512Bits uniDataMulAlpha_4x4;\n\ -_viv_uniform VXC_512Bits uniDataSubMean_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform VXC_512Bits uniExtactInteger_2x8;\n\ -\n\ -#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ -__kernel void ScaletoTensor_Int16\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ -\n\ - int4 xPos = get_global_id(0);\n\ - int yPos = get_global_id(1);\n\ -\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ - xPos += (int4)(0, 1, 2, 3);\n\ -\n\ - //x\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ - int4 sx = fx0 & 0xffff8000;\n\ - fx0 -= sx;\n\ - sx = sx >> 15;\n\ -\n\ - vxc_short4 fx;\n\ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ - //y\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ - int sy = fy & 0xffff8000; // Floor\n\ -\n\ - 
fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ -\n\ - //R\n\ - vxc_uchar16 line0RGB1, line0RGB2;\n\ - vxc_uchar16 line1RGB3, line1RGB4;\n\ - int4 coord;\n\ - sx = sx * 3 + *xOffset;\n\ - coord.xyz = sx.xyz;\n\ - coord.w = sy + *yOffset;\n\ - int2 coord1 = (int2)(sx.w, coord.w);\n\ - VXC_ReadImage(line0RGB1, input, coord.xw,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB1, input, coord.yw,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB2, input, coord.zw,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB2, input, coord1,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(line1RGB3, input, coord.xw,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB3, input, coord.yw,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB4, input, coord.zw,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB4, input, coord1,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0);\n\ -\n\ - bgrMean *= f32Var;\n\ -\n\ - int4 test01, temp1;\n\ - int4 test02, temp2;\n\ - int4 tt;\n\ - vxc_uchar4 val;\n\ - int4 coord_out = (int4)(xPos.x, yPos, 2, 0);\n\ -\n\ - vxc_uchar8 line1, line2;\n\ -\n\ - //R\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ -\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - vxc_float4 tmp_dst;\n\ - vxc_uchar4 u8_dst;\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - //convert U8 to dfp8\n\ - int4 dst0;\n\ - vxc_short4 dst;\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.z;\n\ - tmp_dst *= outputScale;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //G\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ -\n\ - coord_out.z = 1;\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, 
line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.y;\n\ - tmp_dst *= outputScale;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //B\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ -\n\ - coord_out.z = 0;\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.x;\n\ - tmp_dst *= outputScale;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -_viv_uniform float outputZP;\n\ -__kernel void ScaletoTensor_UInt8\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ -\n\ - int4 xPos = get_global_id(0);\n\ - int yPos = get_global_id(1);\n\ -\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ - xPos += (int4)(0, 1, 2, 3);\n\ -\n\ - //x\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ - int4 sx = fx0 & 0xffff8000;\n\ - fx0 -= sx;\n\ - sx = sx >> 15;\n\ -\n\ - vxc_short4 fx;\n\ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ - //y\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ - int sy = fy & 0xffff8000; // Floor\n\ -\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ -\n\ - //R\n\ - vxc_uchar16 line0RGB1, line0RGB2;\n\ - vxc_uchar16 line1RGB3, line1RGB4;\n\ - int4 coord;\n\ - sx = sx * 3 + *xOffset;\n\ - coord.xyz = sx.xyz;\n\ - coord.w = sy + *yOffset;\n\ - int2 coord1 = (int2)(sx.w, coord.w);\n\ - VXC_ReadImage(line0RGB1, input, coord.xw,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB1, input, coord.yw,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB2, input, coord.zw,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 
5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0RGB2, input, coord1,\\\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(line1RGB3, input, coord.xw,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB3, input, coord.yw,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB4, input, coord.zw,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1RGB4, input, coord1,\\\n\ - VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0);\n\ -\n\ - bgrMean *= f32Var;\n\ -\n\ - int4 test01, temp1;\n\ - int4 test02, temp2;\n\ - int4 tt;\n\ - vxc_uchar4 val;\n\ - int4 coord_out = (int4)(xPos.x, yPos, 2, 0);\n\ -\n\ - vxc_uchar8 line1, line2;\n\ -\n\ - //R\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ -\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - vxc_float4 tmp_dst;\n\ - vxc_uchar4 u8_dst;\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - //convert U8 to dfp8\n\ - int4 dst0;\n\ - vxc_uchar4 dst;\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.z;\n\ - tmp_dst = tmp_dst * outputScale + outputZP;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //G\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ -\n\ - coord_out.z = 1;\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.y;\n\ - tmp_dst = tmp_dst * outputScale + outputZP;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ - 
VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //B\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ -\n\ - coord_out.z = 0;\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.x;\n\ - tmp_dst = tmp_dst * outputScale + outputZP;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_imageprocess_2_vx*/ - -static const char vsi_nn_kernel_imageprocess_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniExtractR_2x8;\n\ -_viv_uniform VXC_512Bits uniExtractG_2x8;\n\ -_viv_uniform VXC_512Bits uniExtractB_2x8;\n\ -_viv_uniform float outputScale;\n\ -__kernel void ScaletoTensor_Fp16_copy\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ -\n\ - coord.xy += (int2) (*xOffset, *yOffset);\n\ - vxc_uchar16 src0, src1;\n\ - vxc_half8 dst;\n\ -\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var);\n\ - //convert U8 to FP16\n\ - half4 paramData_f16;\n\ - vxc_short8 tmp_dst;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0);\n\ - //R\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractR_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractR_2x8);\n\ - _viv_asm(COPY, tmp_dst, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ -\n\ - //G\n\ - coord_out.z = 1;\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractG_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractG_2x8);\n\ - _viv_asm(COPY, tmp_dst, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //B\n\ - coord_out.z = 0;\n\ - VXC_DP2x8(dst, 
src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractB_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractB_2x8);\n\ - _viv_asm(COPY, tmp_dst, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void ScaletoTensor_Int8_copy\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ -\n\ - coord.xy += (int2) (*xOffset, *yOffset);\n\ - vxc_uchar16 src0, src1;\n\ - vxc_char16 dst;\n\ -\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - f32Var *= outputScale;\n\ - float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var);\n\ - //convert U8 to FP16\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0);\n\ - //R\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ -\n\ - //G\n\ - coord_out.z = 1;\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //B\n\ - coord_out.z = 0;\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void ScaletoTensor_Int16_copy\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ -\n\ - coord.xy += (int2) (*xOffset, *yOffset);\n\ - vxc_uchar16 src0, src1;\n\ - vxc_short8 dst;\n\ -\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - f32Var *= outputScale;\n\ - float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var);\n\ - //convert U8 to FP16\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0);\n\ - //R\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ - 
VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ -\n\ - //G\n\ - coord_out.z = 1;\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //B\n\ - coord_out.z = 0;\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -_viv_uniform float outputZP;\n\ -__kernel void ScaletoTensor_UInt8_copy\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ -\n\ - coord.xy += (int2) (*xOffset, *yOffset);\n\ - vxc_uchar16 src0, src1;\n\ - vxc_uchar16 dst;\n\ -\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - f32Var *= outputScale;\n\ - float4 paramData = (float4)(rMean * f32Var - outputZP,\\\n\ - gMean * f32Var - outputZP, bMean * f32Var - outputZP, f32Var);\n\ - //convert U8 to FP16\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0);\n\ - //R\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ -\n\ - //G\n\ - coord_out.z = 1;\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //B\n\ - coord_out.z = 0;\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_imageprocess_3_vx*/ - -static const char vsi_nn_kernel_imageprocess_4_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniVecShift10;\n\ -_viv_uniform VXC_512Bits uniAddRShift;\n\ -_viv_uniform VXC_512Bits uniGetTempVal;\n\ -_viv_uniform VXC_512Bits uniExtractBytes;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform VXC_512Bits uniExtactInteger_2x8;\n\ -\n\ -#define DESCALE(x) (((x) + (1<<19)) >> 
20)\n\ -__kernel void GrayScaletoTensor_Int8\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float mean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ -\n\ - int4 xPos = get_global_id(0);\n\ - int yPos = get_global_id(1);\n\ -\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ - xPos += (int4)(0, 1, 2, 3);\n\ -\n\ - //x\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ - int4 sx = fx0 & 0xffff8000;\n\ - fx0 -= sx;\n\ - sx = sx >> 15;\n\ -\n\ - vxc_short4 fx;\n\ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ - //y\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ - int sy = fy & 0xffff8000; // Floor\n\ -\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ -\n\ - //R\n\ - vxc_uchar16 line0Y;\n\ - vxc_uchar16 line1Y;\n\ - int4 coord;\n\ - sx = sx + *xOffset;\n\ - coord.xyz = sx.xyz;\n\ - coord.w = sy + *yOffset;\n\ - int2 coord1 = (int2)(sx.w, coord.w);\n\ - VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float grayMean = mean * f32Var;\n\ -\n\ - int4 test01, temp1;\n\ - int4 test02, temp2;\n\ - int4 tt;\n\ - vxc_uchar4 val;\n\ - int2 coord_out = (int2)(xPos.x, yPos);\n\ -\n\ - vxc_uchar8 line1, line2;\n\ -\n\ - VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - vxc_float4 tmp_dst;\n\ - vxc_uchar4 u8_dst;\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero,\n\ - 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - //convert U8 to dfp8\n\ - int4 dst0;\n\ - vxc_char4 dst;\n\ - tmp_dst = tmp_dst * f32Var - grayMean;\n\ - tmp_dst *= outputScale;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ -\n\ - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -_viv_uniform VXC_512Bits uniDataMulAlpha_4x4;\n\ -_viv_uniform VXC_512Bits 
uniDataSubMean_4x4;\n\ -__kernel void GrayScaletoTensor_Fp16\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float mean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ -\n\ - int4 xPos = get_global_id(0);\n\ - int yPos = get_global_id(1);\n\ -\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ - xPos += (int4)(0, 1, 2, 3);\n\ -\n\ - //x\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ - int4 sx = fx0 & 0xffff8000;\n\ - fx0 -= sx;\n\ - sx = sx >> 15;\n\ -\n\ - vxc_short4 fx;\n\ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ - //y\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ - int sy = fy & 0xffff8000; // Floor\n\ -\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ -\n\ - //R\n\ - vxc_uchar16 line0Y;\n\ - vxc_uchar16 line1Y;\n\ - int4 coord;\n\ - sx = sx + *xOffset;\n\ - coord.xyz = sx.xyz;\n\ - coord.w = sy + *yOffset;\n\ - int2 coord1 = (int2)(sx.w, coord.w);\n\ - VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float grayMean = mean;\n\ -\n\ - int4 test01, temp1;\n\ - int4 test02, temp2;\n\ - int4 tt;\n\ - vxc_uchar4 val;\n\ - int2 coord_out = (int2)(xPos.x, yPos);\n\ -\n\ - VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ -\n\ - //convert U8 to FP16\n\ - half f16mean;\n\ - half f16alpha;\n\ - vxc_half4 dst;\n\ - vxc_short4 tmp_dst;\n\ - _viv_asm(CONV, f16mean, grayMean);\n\ - _viv_asm(CONV, f16alpha, f32Var);\n\ - VXC_DP4x4(dst, val, f16mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4);\n\ - VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4);\n\ - _viv_asm(COPY, tmp_dst, dst, 8);\n\ - VXC_WriteImage(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_imageprocess_4_vx*/ - -static const char vsi_nn_kernel_imageprocess_5_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ 
-_viv_uniform VXC_512Bits uniVecShift10;\n\ -_viv_uniform VXC_512Bits uniAddRShift;\n\ -_viv_uniform VXC_512Bits uniGetTempVal;\n\ -_viv_uniform VXC_512Bits uniExtractBytes;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform VXC_512Bits uniExtactInteger_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ -_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ -__kernel void GrayScaletoTensor_Fp16_copy\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float mean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - coord.xy += (int2) (*xOffset, *yOffset);\n\ - vxc_uchar16 src0;\n\ - vxc_half8 dst0, dst1;\n\ -\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord.x = coord.z + 8;\n\ - float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var);\n\ - //convert U8 to FP16\n\ - half4 paramData_f16;\n\ - vxc_short8 tmp_dst;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - VXC_DP2x8(dst0, src0, paramData_f16,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataMeanStddevLo_2x8);\n\ - VXC_DP2x8(dst1, src0, paramData_f16,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataMeanStddevHi_2x8);\n\ - _viv_asm(COPY, tmp_dst, dst0, 16);\n\ - VXC_WriteImage(output, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, tmp_dst, dst1, 16);\n\ - VXC_WriteImage(output, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void GrayScaletoTensor_Int8_copy\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float mean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - coord.xy += (int2) (*xOffset, *yOffset);\n\ - vxc_uchar16 src0;\n\ - vxc_char16 dst;\n\ -\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - f32Var *= outputScale;\n\ - float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var);\n\ - //convert U8 to FP16\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - VXC_DP2x8(dst, src0, paramData_f16,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8);\n\ - VXC_DP2x8(dst, src0, paramData_f16,\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8);\n\ - VXC_WriteImage(output, coord.zw, dst,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ -}\n\ -\n\ -__kernel void GrayScaletoTensor_Int16\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float mean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ -\n\ - int4 xPos = get_global_id(0);\n\ - int yPos = get_global_id(1);\n\ -\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ - xPos += (int4)(0, 1, 2, 3);\n\ -\n\ - //x\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ - int4 sx = fx0 & 0xffff8000;\n\ 
- fx0 -= sx;\n\ - sx = sx >> 15;\n\ -\n\ - vxc_short4 fx;\n\ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ - //y\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ - int sy = fy & 0xffff8000; // Floor\n\ -\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ -\n\ - vxc_uchar16 line0Y;\n\ - vxc_uchar16 line1Y;\n\ - int4 coord;\n\ - sx = sx + *xOffset;\n\ - coord.xyz = sx.xyz;\n\ - coord.w = sy + *yOffset;\n\ - int2 coord1 = (int2)(sx.w, coord.w);\n\ - VXC_ReadImage(line0Y, input, coord.xw,\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord.yw,\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord.zw,\n\ - VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float grayMean = mean * f32Var;\n\ -\n\ - int4 test01, temp1;\n\ - int4 test02, temp2;\n\ - int4 tt;\n\ - vxc_uchar4 val;\n\ - int2 coord_out = (int2)(xPos.x, yPos);\n\ -\n\ - vxc_uchar8 line1, line2;\n\ -\n\ - VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - vxc_float4 tmp_dst;\n\ - vxc_uchar4 u8_dst;\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - //convert U8 to dfp8\n\ - int4 dst0;\n\ - vxc_short4 dst;\n\ - tmp_dst = tmp_dst * f32Var - grayMean;\n\ - tmp_dst *= outputScale;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ -\n\ - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void GrayScaletoTensor_Int16_copy\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float mean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - coord.xy += (int2) (*xOffset, *yOffset);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 dst0, dst1;\n\ -\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord.x = coord.z + 8;\n\ -\n\ - f32Var *= outputScale;\n\ - float4 paramData = 
(float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var);\n\ - //convert U8 to FP16\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ -\n\ - VXC_DP2x8(dst0, src0, paramData_f16,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8);\n\ - VXC_DP2x8(dst1, src0, paramData_f16,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8);\n\ - VXC_WriteImage(output, coord.zw, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xw, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -_viv_uniform float outputZP;\n\ -__kernel void GrayScaletoTensor_UInt8_copy\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float mean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - coord.xy += (int2) (*xOffset, *yOffset);\n\ - vxc_uchar16 src0;\n\ - vxc_uchar16 dst;\n\ -\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - f32Var *= outputScale;\n\ - float4 paramData = (float4)(mean * f32Var - outputZP, mean * f32Var - outputZP,\n\ - mean * f32Var - outputZP, f32Var);\n\ - //convert U8 to FP16\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - VXC_DP2x8(dst, src0, paramData_f16,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8);\n\ - VXC_DP2x8(dst, src0, paramData_f16,\n\ - VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8);\n\ - VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void GrayScaletoTensor_UInt8\n\ - (\n\ - __read_only image2d_t input,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float mean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ -\n\ - int4 xPos = get_global_id(0);\n\ - int yPos = get_global_id(1);\n\ -\n\ - int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ - xPos += (int4)(0, 1, 2, 3);\n\ -\n\ - //x\n\ - int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ - int4 sx = fx0 & 0xffff8000;\n\ - fx0 -= sx;\n\ - sx = sx >> 15;\n\ -\n\ - vxc_short4 fx;\n\ - VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ - //y\n\ - int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ - int sy = fy & 0xffff8000; // Floor\n\ -\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ -\n\ - //R\n\ - vxc_uchar16 line0Y;\n\ - vxc_uchar16 line1Y;\n\ - int4 coord;\n\ - sx = sx + *xOffset;\n\ - coord.xyz = sx.xyz;\n\ - coord.w = sy + *yOffset;\n\ - int2 coord1 = (int2)(sx.w, coord.w);\n\ - VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ 
- VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - float grayMean = mean * f32Var;\n\ -\n\ - int4 test01, temp1;\n\ - int4 test02, temp2;\n\ - int4 tt;\n\ - vxc_uchar4 val;\n\ - int2 coord_out = (int2)(xPos.x, yPos);\n\ -\n\ - VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp1 = temp1 + test01;\n\ -\n\ - VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ - VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ - temp2 = temp2 + test02;\n\ - temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ -\n\ - vxc_float4 tmp_dst;\n\ - vxc_uchar4 u8_dst;\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ - VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ -\n\ - //convert U8 to dfp8\n\ - int4 dst0;\n\ - vxc_uchar4 dst;\n\ - tmp_dst = tmp_dst * f32Var - grayMean;\n\ - tmp_dst = tmp_dst * outputScale + outputZP;\n\ - dst0 = convert_int4_rte(tmp_dst);\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ -\n\ - VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_imageprocess_5_vx*/ - -static const char vsi_nn_kernel_roi_align_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcRoi_align(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ -\n\ -}\n\ -"; /* end of vsi_nn_kernel_roi_align_vx*/ - -static const char vsi_nn_kernel_signalframe_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int input_width;\n\ -_viv_uniform int input_height;\n\ -_viv_uniform int input_channel;\n\ -_viv_uniform int output_channel;\n\ -\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_width(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int frame_length,\n\ - int step,\n\ - int pad_end,\n\ - int pad,\n\ - int axis)\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int outChn = gidz * input_height + gidy;\n\ - int4 coord = (int4)(0, gidy, gidz, 0);\n\ - int4 coord_out = (int4)(0, 0, outChn, 0);\n\ -\n\ - int endcoord = (pad_end == 0) ? 
(input_width - frame_length + 1) : (input_width);\n\ - int iter = frame_length / 8;\n\ - int res = frame_length % 8;\n\ - vxc_short8 src0;\n\ -\n\ - for(int i = 0; i < endcoord; i += step)\n\ - {\n\ - coord.x = i;\n\ - for(int j = 0; j < iter; j++)\n\ - {\n\ - coord_out.x = j << 3;\n\ - coord.x = i + (j << 3);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ - coord.x = i + (iter << 3);\n\ - coord_out.x = (iter << 3);\n\ - for(int j = 0; j < res; j++)\n\ - {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_out.x++;\n\ - coord.x++;\n\ - }\n\ -\n\ - coord_out.y++;\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_height(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int frame_length,\n\ - int step,\n\ - int pad_end,\n\ - int pad,\n\ - int axis)\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int outChn = gidz * output_channel + (gidy / step);\n\ - int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ - int4 coord_out = (int4)(gidx, 0, outChn, 0);\n\ - vxc_short8 src0;\n\ -\n\ - for(int i = 0; i < frame_length; i++)\n\ - {\n\ - coord.y = gidy + i;\n\ - coord_out.y = i;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_channel(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int frame_length,\n\ - int step,\n\ - int pad_end,\n\ - int pad,\n\ - int axis)\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int outChn = (gidz / step) * frame_length;\n\ - int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ - int4 coord_out = (int4)(gidx, gidy, outChn, 0);\n\ - vxc_short8 src0;\n\ -\n\ - for(int i = 0; i < frame_length; i++)\n\ - {\n\ - coord.z = gidz + i;\n\ - coord_out.z = outChn + i;\n\ - if(coord.z < input_channel)\n\ - {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ - else\n\ - {\n\ - src0 = (vxc_short8)(0);\n\ - }\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_width_8bit(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int frame_length,\n\ - int step,\n\ - int pad_end,\n\ - int pad,\n\ - int axis)\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int outChn = gidz * input_height + gidy;\n\ - int4 coord = (int4)(0, gidy, gidz, 0);\n\ - int4 coord_out = (int4)(0, 0, outChn, 0);\n\ -\n\ - int endcoord = (pad_end == 0) ? 
(input_width - frame_length + 1) : (input_width);\n\ - int iter = frame_length / 8;\n\ - int res = frame_length % 8;\n\ - vxc_char8 src0;\n\ -\n\ - for(int i = 0; i < endcoord; i += step)\n\ - {\n\ - coord.x = i;\n\ - for(int j = 0; j < iter; j++)\n\ - {\n\ - coord_out.x = j << 3;\n\ - coord.x = i + (j << 3);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ - coord.x = i + (iter << 3);\n\ - coord_out.x = (iter << 3);\n\ - for(int j = 0; j < res; j++)\n\ - {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_out.x++;\n\ - coord.x++;\n\ - }\n\ -\n\ - coord_out.y++;\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_height_8bit(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int frame_length,\n\ - int step,\n\ - int pad_end,\n\ - int pad,\n\ - int axis)\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int outChn = gidz * output_channel + (gidy / step);\n\ - int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ - int4 coord_out = (int4)(gidx, 0, outChn, 0);\n\ - vxc_char8 src0;\n\ -\n\ - for(int i = 0; i < frame_length; i++)\n\ - {\n\ - coord.y = gidy + i;\n\ - coord_out.y = i;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_channel_8bit(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int frame_length,\n\ - int step,\n\ - int pad_end,\n\ - int pad,\n\ - int axis)\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int outChn = (gidz / step) * frame_length;\n\ - int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ - int4 coord_out = (int4)(gidx, gidy, outChn, 0);\n\ - vxc_char8 src0;\n\ -\n\ - for(int i = 0; i < frame_length; i++)\n\ - {\n\ - coord.z = gidz + i;\n\ - coord_out.z = outChn + i;\n\ - if(coord.z < input_channel)\n\ - {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ - else\n\ - {\n\ - src0 = (vxc_char8)(0);\n\ - }\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -#if 0\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void vxcSignalFrame_tensor(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - image2d_array_t frame_length,\n\ - image2d_array_t steps,\n\ - image2d_array_t pad_end,\n\ - image2d_array_t pad,\n\ - image2d_array_t axis)\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int outChn = gidz * input_height + gidy;\n\ - int4 coord = (int4)(0, gidy, gidz, 0);\n\ - int4 coord_out = (int4)(0, 0, outChn, 0);\n\ - int4 coord_para = (int4)(0, 0, 0, 0);\n\ -\n\ - int4 size = read_imagei(frame_length, coord_para);\n\ - int4 step = read_imagei(steps, coord_para);\n\ - int4 pe = read_imagei(pad_end, coord_para);\n\ - int4 pd = read_imagei(pad, 
coord_para);\n\ - int len = input_width + (pe.x ? pd : 0);\n\ - int endcoord = len - size.x + 1;\n\ - int iter = size.x / 8;\n\ - int res = size.x % 8;\n\ - vxc_short8 src0;\n\ -\n\ - for(int i = 0; i < endcoord; i += step.x)\n\ - {\n\ - coord.x = i;\n\ - for(int j = 0; j < iter; j++)\n\ - {\n\ - coord_out.x = j << 3;\n\ - coord.x += (j << 3);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ - coord.x = i + (iter << 3);\n\ - coord_out.x = (iter << 3);\n\ - for(int j = 0; j < res; j++)\n\ - {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_out.x++;\n\ - coord.x++;\n\ - }\n\ -\n\ - coord_out.y++;\n\ - }\n\ -}\n\ -#endif\n\ -"; /* end of vsi_nn_kernel_signalframe_vx*/ - -static const char vsi_nn_kernel_tensorstackconcat_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/*******************tensorstackconcat 16BITs********************/\n\ -__kernel void vxcTensorStackConcat(\n\ - image2d_array_t input,\n\ - image2d_t index,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - vxc_short8 src0, src1;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.w = 0;\n\ - coord.y = read_imagei(index, coord.ww).x;\n\ - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -/**************tensorstackconcat 8BITs***************************/\n\ -__kernel void vxcTensorStackConcat8Bits(\n\ - image2d_array_t input,\n\ - image2d_t index,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int idx = coord.x;\n\ - vxc_char16 src0, src1;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 16;\n\ - VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = idx;\n\ - coord.w = 0;\n\ - coord.y = read_imagei(index, coord.ww).x;\n\ - VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 16;\n\ - VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of vsi_nn_kernel_tensorstackconcat_vx*/ - -static const char vsi_nn_kernel_transform_gemm_vx[] = "/*\n\ - ============================================================================\n\ - Name : gemm.vx\n\ - Author : Sam\n\ - Version :\n\ - Copyright : Your copyright notice\n\ - Description :\n\ - ============================================================================\n\ - */\n\ -#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniGemm3x3_4x4;\n\ -__kernel void vxcTransform_Gemm_F16toF16\n\ - (\n\ - __read_only image2d_array_t thetaTensor,\n\ - __read_only image2d_array_t gridTensor,\n\ - __write_only image2d_array_t coordinates\n\ - )\n\ 
-{\n\ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ -\n\ - vxc_short8 vec0, vec1, vec2;\n\ - vxc_half8 src0, src1, src2, dst;\n\ -\n\ - VXC_ReadImage(vec0,thetaTensor,coord.xx,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,5, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, src0, vec0, 16);\n\ - VXC_ReadImage(vec1,gridTensor,coord.yz,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,5,0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, src1, vec1, 16);\n\ - VXC_ReadImage(vec2,gridTensor,coord.yz,VXC_5BITOFFSET_XY(6,0),VXC_MODIFIER(0,5,0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, src2, vec2, 16);\n\ -\n\ - coord.y = (int)((short)coord.y / (short)3) * 2;\n\ -\n\ - VXC_DP4x4(dst, src1, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemm3x3_4x4);\n\ - VXC_DP4x4(dst, src2, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniGemm3x3_4x4);\n\ -\n\ - _viv_asm(COPY, vec0, dst, 16);\n\ - VXC_WriteImage(coordinates, coord.yz, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_transform_gemm_vx*/ - -static const char vsi_nn_kernel_transform_interp_vx[] = "/*\n\ - ============================================================================\n\ - Name : minimum.vx\n\ - Author : Sam\n\ - Version :\n\ - Copyright : Your copyright notice\n\ - Description :\n\ - ============================================================================\n\ - */\n\ -#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniGetDXY_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertF16toF32_4x4;\n\ -_viv_uniform int2 packedWH2;\n\ -_viv_uniform int packedWH;\n\ -__kernel void vxcTransform_InterP_F16toF16_2D\n\ - (\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_array_t input1,\n\ - __write_only image2d_array_t output\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 vec0;\n\ - vxc_half8 pxy;\n\ - vxc_float4 dxy4;\n\ - vxc_int4 pos4;\n\ - short dst = 0;\n\ -\n\ - VXC_ReadImage(vec0, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, pxy, vec0, 4);\n\ -\n\ - coord.x >>= 1;\n\ - vxc_short2 packedWH_16B;\n\ - _viv_asm(COPY, packedWH_16B, packedWH, 4);\n\ - VXC_DP4x4(dxy4, pxy, packedWH_16B, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniGetDXY_4x4);\n\ - dxy4.zw = floor(dxy4.xy);\n\ - pos4.xy = convert_int2(dxy4.zw);\n\ - pos4.zw = convert_int2(ceil(dxy4.xy));\n\ -\n\ - vxc_short8 vec1;\n\ - vxc_half8 src0, src1;\n\ - VXC_ReadImage(vec0, input0, pos4.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, src0, vec0, 8);\n\ - VXC_ReadImage(vec1, input0, pos4.xw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, src1, vec1, 8);\n\ -\n\ - float2 xyLerp = dxy4.xy - dxy4.zw;\n\ - float2 oneSub_xyLerp = 1.0f - xyLerp;\n\ - float4 coef = (float4)(oneSub_xyLerp.x * oneSub_xyLerp.y, xyLerp.x * oneSub_xyLerp.y,\n\ - oneSub_xyLerp.x * xyLerp.y, xyLerp.x * xyLerp.y);\n\ - float4 data;\n\ -\n\ - VXC_DP4x4(data, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16toF32_4x4);\n\ -\n\ - data.x = dot(data, coef);\n\ -\n\ - half tmp;\n\ - _viv_asm(CONV, tmp, data);\n\ - _viv_asm(COPY, dst, tmp, 4);\n\ -\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -_viv_uniform int depth;\n\ -__kernel void vxcTransform_InterP_F16toF16\n\ - (\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_array_t input1,\n\ - __write_only 
image2d_array_t output\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ -\n\ - vxc_short8 vec0;\n\ - vxc_half8 pxy;\n\ - vxc_float4 dxy4;\n\ - vxc_int4 pos4;\n\ - short dst = 0;\n\ -\n\ - VXC_ReadImage(vec0, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, pxy, vec0, 4);\n\ -\n\ - coord.x >>= 1;\n\ - vxc_short2 packedWH_16B;\n\ - _viv_asm(COPY, packedWH_16B, packedWH, 4);\n\ - VXC_DP4x4(dxy4, pxy, packedWH_16B, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniGetDXY_4x4);\n\ - dxy4.zw = floor(dxy4.xy);\n\ - pos4.xy = convert_int2(dxy4.zw);\n\ - pos4.zw = convert_int2(ceil(dxy4.xy));\n\ -\n\ -\n\ - float2 xyLerp = dxy4.xy - dxy4.zw;\n\ - float2 oneSub_xyLerp = 1.0f - xyLerp;\n\ - float4 coef = (float4)(oneSub_xyLerp.x * oneSub_xyLerp.y, xyLerp.x * oneSub_xyLerp.y,\n\ - oneSub_xyLerp.x * xyLerp.y, xyLerp.x * xyLerp.y);\n\ -\n\ - int4 coord_ = (int4)(pos4.x, pos4.y, 0, 0);\n\ - do\n\ - {\n\ - vxc_short8 vec1;\n\ - vxc_half8 src0, src1;\n\ - VXC_ReadImage2DArray(vec0,input0,coord_,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,1,0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, src0, vec0, 8);\n\ - VXC_ReadImage2DArray(vec1,input0,coord_,VXC_5BITOFFSET_XY(0,1),VXC_MODIFIER(0,1,0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, src1, vec1, 8);\n\ -\n\ - coord_.z ++;\n\ - float4 data;\n\ - VXC_DP4x4(data, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16toF32_4x4);\n\ -\n\ - data.x = dot(data, coef);\n\ -\n\ - half tmp;\n\ - _viv_asm(CONV, tmp, data);\n\ - _viv_asm(COPY, dst, tmp, 4);\n\ -\n\ -\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ - coord.z ++;\n\ -\n\ - } while (coord.z < depth);\n\ -}\n\ -\n\ -"; /* end of vsi_nn_kernel_transform_interp_vx*/ - -static const char vsi_nn_kernel_transform_setupThres_vx[] = "/*\n\ - ============================================================================\n\ - Name : gemm.vx\n\ - Author : Sam\n\ - Version :\n\ - Copyright : Your copyright notice\n\ - Description :\n\ - ============================================================================\n\ - */\n\ -#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int4 extract_packed;\n\ -__kernel void vxcTransform_setupThres_F16toF16\n\ - (\n\ - __read_only image2d_array_t initTensor,\n\ - __read_only image2d_array_t inputFC,\n\ - global int* thresFlag,\n\ - __write_only image2d_array_t thres\n\ - )\n\ -{\n\ - int2 coord = (int2)(0, 0);\n\ -\n\ - vxc_ushort8 src0, src1, dst;\n\ -\n\ - int flag = *thresFlag;\n\ - VXC_ReadImage(src0, initTensor, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, inputFC, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_BitExtract(dst, src0, src1, extract_packed, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_WriteImage(thres, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of vsi_nn_kernel_transform_setupThres_vx*/ +WARP_AFFINE_SH_IMPL(F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +WARP_AFFINE_SH_IMPL(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +WARP_AFFINE_SH_IMPL(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +WARP_AFFINE_SH_IMPL(F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +WARP_AFFINE_SH_IMPL(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +WARP_AFFINE_SH_IMPL(I8, F16, vxc_char8, vxc_char8, half4, 
vxc_half8, vxc_short8)\n\ +WARP_AFFINE_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +WARP_AFFINE_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +WARP_AFFINE_SH_IMPL(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +WARP_AFFINE_SH_IMPL(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +"; /* end of warp_affine_vx*/ @@ -43220,6 +42756,12 @@ __kernel void argmin_axis2_I32toI32_2D\n\ "; /* end of argmin_axis2_cl*/ static const char batchnorm_single_cl[] = "\n\ +#define READ_IMAGEF_ARRAY2D(dest, tensor, coord) \\\n\ + do { \\\n\ + int depth = get_image_array_size(tensor); \\\n\ + _viv_asm(CLAMP0MAX, coord_in0.z, coord_in0.z, in0_depth - 1); \\\n\ + dest = read_imagef(tensor, coord); \\\n\ + } while(0)\n\ __kernel void batch_norm_F32toF32\n\ (\n\ __read_only image2d_array_t input,\n\ @@ -43238,11 +42780,11 @@ __kernel void batch_norm_F32toF32\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ \n\ float4 src, mean, var, gamma, beta;\n\ - readImage2DArray(src, input, coord);\n\ - readImage2DArray(mean, Mean, coord);\n\ - readImage2DArray(var, Variance, coord);\n\ - readImage2DArray(gamma, Gamma, coord);\n\ - readImage2DArray(beta, Beta, coord);\n\ + READ_IMAGEF_2DARRAY(src, input, coord);\n\ + READ_IMAGEF_2DARRAY(mean, Mean, coord);\n\ + READ_IMAGEF_2DARRAY(var, Variance, coord);\n\ + READ_IMAGEF_2DARRAY(gamma, Gamma, coord);\n\ + READ_IMAGEF_2DARRAY(beta, Beta, coord);\n\ \n\ float4 dst;\n\ src.x = src.x - mean.x;\n\ @@ -43302,11 +42844,11 @@ __kernel void batch_norm_U8toU8\n\ \n\ uint4 data;\n\ float4 src, mean, var, gamma, beta;\n\ - readImage2DArray(data, input, coord);\n\ - readImage2DArray(mean, Mean, coord);\n\ - readImage2DArray(var, Variance, coord);\n\ - readImage2DArray(gamma, Gamma, coord);\n\ - readImage2DArray(beta, Beta, coord);\n\ + READ_IMAGEF_2DARRAY(data, input, coord);\n\ + READ_IMAGEF_2DARRAY(mean, Mean, coord);\n\ + READ_IMAGEF_2DARRAY(var, Variance, coord);\n\ + READ_IMAGEF_2DARRAY(gamma, Gamma, coord);\n\ + READ_IMAGEF_2DARRAY(beta, Beta, coord);\n\ \n\ src = convert_float4(data) * input_scale - input_tail;\n\ src.x = src.x - mean.x;\n\ @@ -43695,12 +43237,20 @@ inline Image create_image_from_image2d(image2d_t input, int stride_x)\n\ {\n\ int8 desc;\n\ _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ +#if (USE_40BITS_VA==0)\n\ + uint address = as_uint(desc.s0);\n\ + int stride_y = desc.s1;\n\ +#else\n\ + ulong address = as_ulong(desc.s05);\n\ + int stride_y = desc.s6;\n\ +#endif\n\ \n\ Image img =\n\ {\n\ - .ptr = (uchar*)desc.s0,\n\ + .ptr = (uchar*)address,\n\ .stride_x = stride_x,\n\ - .stride_y = desc.s1\n\ + .stride_y = stride_y\n\ };\n\ \n\ return img;\n\ @@ -43714,67 +43264,74 @@ typedef struct Tensor\n\ int stride_z;\n\ } Tensor;\n\ \n\ -inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ +inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ {\n\ return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z;\n\ }\n\ \n\ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)\n\ {\n\ +#if (USE_40BITS_VA==0)\n\ int8 desc;\n\ _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + uint address = as_uint(desc.s0);\n\ + int stride_y = desc.s1;\n\ + int stride_z = desc.s4;\n\ +#else\n\ + int16 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + ulong address = as_ulong(desc.s05);\n\ + int stride_y = desc.s6;\n\ + int stride_z = desc.sa;\n\ +#endif\n\ \n\ Tensor t 
=\n\ {\n\ - .ptr = (uchar*)desc.s0,\n\ + .ptr = (uchar*)address,\n\ .stride_x = stride_x,\n\ - .stride_y = desc.s1,\n\ - .stride_z = desc.s4\n\ + .stride_y = stride_y,\n\ + .stride_z = stride_z\n\ };\n\ \n\ return t;\n\ }\n\ \n\ -#define readImage2DArray(Dest, Image, Coord) \\\n\ - do { \\\n\ - int8 desc; \\\n\ - _viv_asm(COPY, desc, Image, sizeof(desc)); \\\n\ - _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \\\n\ - int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \\\n\ - _viv_asm(MOV, (Coord).w, baseAddr); \\\n\ - _viv_asm(IMAGE_READ_3D, Dest, Image, (Coord).xyww); \\\n\ - } while (0)\n\ +#define READ_IMAGEF_2DARRAY(dest, tensor, coord) \\\n\ + do { \\\n\ + int depth = get_image_array_size(tensor); \\\n\ + int4 coord_in = coord; \\\n\ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \\\n\ + dest = read_imagef(tensor, coord_in); \\\n\ + } while(0)\n\ \n\ -#define writeImage2DArray(Image, Coord, Color) \\\n\ - do { \\\n\ - int8 desc; \\\n\ - _viv_asm(COPY, desc, Image, sizeof(desc)); \\\n\ - _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \\\n\ - int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \\\n\ - _viv_asm(MOV, (Coord).w, baseAddr); \\\n\ - _viv_asm(IMAGE_WRITE_3D, Color, Image, (Coord).xyww); \\\n\ - } while (0)\n\ +#define READ_IMAGEI_2DARRAY(dest, tensor, coord) \\\n\ + do { \\\n\ + int depth = get_image_array_size(tensor); \\\n\ + int4 coord_in = coord; \\\n\ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \\\n\ + dest = read_imagei(tensor, coord_in); \\\n\ + } while(0)\n\ \n\ -#define readImage(Dest, Image, Coord) \\\n\ - do { \\\n\ - _viv_asm(IMAGE_READ, Dest, Image, Coord); \\\n\ - } while (0)\n\ -\n\ -#define writeImage(Image, Coord, Color) \\\n\ - do { \\\n\ - _viv_asm(IMAGE_WRITE, Color, Image, Coord); \\\n\ - } while (0)\n\ +#define READ_IMAGEUI_2DARRAY(dest, tensor, coord) \\\n\ + do { \\\n\ + int depth = get_image_array_size(tensor); \\\n\ + int4 coord_in = coord; \\\n\ + _viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \\\n\ + dest = read_imageui(tensor, coord_in); \\\n\ + } while(0)\n\ "; /* end of eltwise_ops_helper_cl*/ static const char eltwise_unary_cl[] = "\n\ -float4 eltwise_unary_sin(float4 x, float alpha)\n\ +float eltwise_unary_sin(float x, float alpha)\n\ {\n\ return native_sin(x);\n\ }\n\ \n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ -float4 eltwise_unary_exp(float4 x, float alpha)\n\ +float eltwise_unary_exp(float x, float alpha)\n\ {\n\ x *= logE;\n\ x = exp2(x);\n\ @@ -43782,33 +43339,33 @@ float4 eltwise_unary_exp(float4 x, float alpha)\n\ }\n\ \n\ #define rlogE (0.693147182f)\n\ -float4 eltwise_unary_log(float4 x, float alpha)\n\ +float eltwise_unary_log(float x, float alpha)\n\ {\n\ x = log2(x);\n\ return x * rlogE;\n\ }\n\ \n\ -float4 eltwise_unary_elu(float4 val, float alpha)\n\ +float eltwise_unary_elu(float val, float alpha)\n\ {\n\ - float4 x = val * logE;\n\ + float x = val * logE;\n\ x = exp2(x) * alpha - alpha;\n\ \n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ -float4 eltwise_unary_neg(float4 x, float alpha)\n\ +float eltwise_unary_neg(float x, float alpha)\n\ {\n\ return x * -1;\n\ }\n\ \n\ -float4 eltwise_unary_hard_sigmoid(float4 x, float alpha)\n\ +float eltwise_unary_hard_sigmoid(float x, float alpha)\n\ {\n\ x = 0.2 * x + 0.5;\n\ x = clamp(x, 0, 1);\n\ return x;\n\ }\n\ \n\ -float4 _softrelu(float4 x, float alpha)\n\ +float _softrelu(float x, float alpha)\n\ {\n\ x *= logE;\n\ x = exp2(x);\n\ @@ -43817,7 +43374,7 @@ float4 _softrelu(float4 x, float alpha)\n\ return x * rlogE;\n\ }\n\ \n\ -float4 _tanh(float4 x, float alpha)\n\ +float _tanh(float x, float alpha)\n\ {\n\ x *= -twoLogE;\n\ x = 1 + exp2(x);\n\ @@ -43825,16 +43382,60 @@ float4 _tanh(float4 x, float alpha)\n\ return (2 * x - 1);\n\ }\n\ \n\ -float4 eltwise_unary_mish(float4 x, float alpha)\n\ +float eltwise_unary_mish(float x, float alpha)\n\ {\n\ - float4 y = _softrelu(x, alpha);\n\ + float y = _softrelu(x, alpha);\n\ x = x * _tanh(y, alpha);\n\ return x;\n\ }\n\ \n\ -float4 eltwise_unary_round(float4 x, float alpha)\n\ +float eltwise_unary_round(float x, float alpha)\n\ {\n\ - return convert_float4(convert_int4_rte(x));\n\ + return convert_float(convert_int_rte(x));\n\ +}\n\ +\n\ +#define MUL2_RSQRTPI (1.1283791670955126f)\n\ +float erf_eval(float x)\n\ +{\n\ + float res = 0;\n\ + float tmp = x;\n\ + float factorial = 1;\n\ + float x_pow = x;\n\ + float one = 1.0f;\n\ + float n = 1;\n\ +\n\ + if (x <= -3)\n\ + return -1;\n\ + else if (x >= 3)\n\ + return 1;\n\ +\n\ + while (fabs(tmp) > 1e-5)\n\ + {\n\ + res += tmp;\n\ +\n\ + factorial *= n;\n\ + one *= -1;\n\ + x_pow *= x * x;\n\ + tmp = one / factorial * x_pow / ( 2 * n + 1);\n\ +\n\ + n += 1.0f;\n\ + }\n\ + return res * MUL2_RSQRTPI;\n\ +}\n\ +#define RSQRT2 (0.70710678118654752440084436210485f)\n\ +float eltwise_unary_gelu(float x, float alpha)\n\ +{\n\ + x = 0.5f * x * (1 + erf_eval(x * RSQRT2));\n\ +\n\ + return x;\n\ +}\n\ +\n\ +#define SQRT_2_RCP_PI 0.7978845834732056f\n\ +float eltwise_unary_hard_gelu(float x, float alpha)\n\ +{\n\ + float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *\n\ + (x + 0.044715f * x * x * x), 0);\n\ + return x * cdf;\n\ }\n\ \n\ #define ELTWISE_UNARY_F32(func_name) \\\n\ @@ -43853,9 +43454,10 @@ __kernel void func_name##_F32toF32 \\\n\ \\\n\ float4 src = read_imagef(input, coord); \\\n\ \\\n\ - float4 dst = eltwise_unary_##func_name(src, alpha); \\\n\ + float4 dst = 0; \\\n\ + dst.x = eltwise_unary_##func_name(src.x, alpha); \\\n\ \\\n\ - write_imagef(output, coord, dst); \\\n\ + write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ ELTWISE_UNARY_F32(sin)\n\ ELTWISE_UNARY_F32(exp)\n\ @@ -43865,6 +43467,8 @@ ELTWISE_UNARY_F32(neg)\n\ ELTWISE_UNARY_F32(mish)\n\ ELTWISE_UNARY_F32(hard_sigmoid)\n\ ELTWISE_UNARY_F32(round)\n\ +ELTWISE_UNARY_F32(gelu)\n\ +ELTWISE_UNARY_F32(hard_gelu)\n\ \n\ #define ELTWISE_UNARY_F32_2D(func_name) \\\n\ __kernel void func_name##_F32toF32_2D \\\n\ @@ -43882,9 +43486,10 @@ __kernel void func_name##_F32toF32_2D \\\n\ \\\n\ float4 src = read_imagef(input, coord); \\\n\ \\\n\ - float4 dst = eltwise_unary_##func_name(src, alpha); \\\n\ + float4 dst = 0; \\\n\ + dst.x = eltwise_unary_##func_name(src.x, alpha); \\\n\ \\\n\ - write_imagef(output, coord, dst); \\\n\ + write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ ELTWISE_UNARY_F32_2D(sin)\n\ ELTWISE_UNARY_F32_2D(exp)\n\ @@ -43894,6 +43499,8 @@ ELTWISE_UNARY_F32_2D(neg)\n\ ELTWISE_UNARY_F32_2D(mish)\n\ ELTWISE_UNARY_F32_2D(hard_sigmoid)\n\ ELTWISE_UNARY_F32_2D(round)\n\ +ELTWISE_UNARY_F32_2D(gelu)\n\ 
+ELTWISE_UNARY_F32_2D(hard_gelu)\n\ \n\ #define ELTWISE_UNARY_U8(func_name) \\\n\ __kernel void func_name##_U8toU8 \\\n\ @@ -43912,7 +43519,7 @@ __kernel void func_name##_U8toU8 \\\n\ uint4 src = read_imageui(input, coord); \\\n\ float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ \\\n\ - data = eltwise_unary_##func_name(data, alpha); \\\n\ + data.x = eltwise_unary_##func_name(data.x, alpha); \\\n\ uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ \\\n\ write_imageui(output, coord, dst); \\\n\ @@ -43925,6 +43532,8 @@ ELTWISE_UNARY_U8(neg)\n\ ELTWISE_UNARY_U8(mish)\n\ ELTWISE_UNARY_U8(hard_sigmoid)\n\ ELTWISE_UNARY_U8(round)\n\ +ELTWISE_UNARY_U8(gelu)\n\ +ELTWISE_UNARY_U8(hard_gelu)\n\ \n\ #define ELTWISE_UNARY_U8_2D(func_name) \\\n\ __kernel void func_name##_U8toU8_2D \\\n\ @@ -43943,7 +43552,7 @@ __kernel void func_name##_U8toU8_2D \\\n\ uint4 src = read_imageui(input, coord); \\\n\ float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ \\\n\ - data = eltwise_unary_##func_name(data, alpha); \\\n\ + data.x = eltwise_unary_##func_name(data.x, alpha); \\\n\ uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ \\\n\ write_imageui(output, coord, dst); \\\n\ @@ -43956,6 +43565,8 @@ ELTWISE_UNARY_U8_2D(neg)\n\ ELTWISE_UNARY_U8_2D(mish)\n\ ELTWISE_UNARY_U8_2D(hard_sigmoid)\n\ ELTWISE_UNARY_U8_2D(round)\n\ +ELTWISE_UNARY_U8_2D(gelu)\n\ +ELTWISE_UNARY_U8_2D(hard_gelu)\n\ \n\ __kernel void neg_I32toI32\n\ (\n\ @@ -43997,8 +43608,9 @@ __kernel void neg_I32toI32_2D\n\ "; /* end of eltwise_unary_cl*/ static const char erf_cl[] = "#define MUL2_RSQRTPI (1.1283791670955126f)\n\ -float eltwise_unary_erf(float x)\n\ +float eltwise_unary_erf(float _x)\n\ {\n\ + float x = clamp(_x, -2, 2);\n\ float res = 0;\n\ float tmp = x;\n\ float factorial = 1;\n\ @@ -44119,8 +43731,8 @@ static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32(\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ float4 src0;\n\ float4 src1;\n\ - readImage2DArray(src0, input, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEF_2DARRAY(src0, input, coord);\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ float4 dst = floor(src0 / src1);\n\ write_imagef(output, coord, dst);\n\ }\n\ @@ -44145,8 +43757,8 @@ __kernel void floordiv_I32I32toI32(\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 src0;\n\ int4 src1;\n\ - readImage2DArray(src0, input, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));\n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -44177,8 +43789,8 @@ __kernel void floordiv_I32I32toU8(\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 src0;\n\ int4 src1;\n\ - readImage2DArray(src0, input, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);\n\ write_imageui(output, coord, dst);\n\ }\n\ @@ -44215,8 +43827,8 @@ __kernel void floordiv_U8U8toU8(\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ uint4 src0, src1;\n\ float4 in0, in1, out;\n\ - readImage2DArray(src0, input, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + 
READ_IMAGEUI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEUI_2DARRAY(src1, input1, coord);\n\ in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ out = floor(in0 / in1) * outputScale + outputTail;\n\ @@ -44261,8 +43873,8 @@ __kernel void floordiv_U8I32toU8(\n\ uint4 src0;\n\ int4 src1;\n\ float4 in0, in1, out;\n\ - readImage2DArray(src0, input, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEUI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ in1 = convert_float4(src1);\n\ out = floor(in0 / in1) * outputScale + outputTail;\n\ @@ -47672,8 +47284,8 @@ __kernel void logical_##name##_I8toI8( \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ int4 src0; \\\n\ int4 src1; \\\n\ - readImage2DArray(src0, input, coord); \\\n\ - readImage2DArray(src1, input1, coord); \\\n\ + READ_IMAGEI_2DARRAY(src0, input, coord); \\\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord); \\\n\ int4 dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \\\n\ dst.x = dst.x & 1; \\\n\ write_imagei(output, coord, dst); \\\n\ @@ -50931,8 +50543,8 @@ static const char maximum_cl[] = "__kernel void maximum_FP32FP32toFP32\n\ \n\ float4 src0;\n\ float4 src1;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEF_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ \n\ float4 dst = src0 > src1 ? src0 : src1;\n\ \n\ @@ -50979,8 +50591,8 @@ __kernel void maximum_U8U8toU8\n\ \n\ uint4 src0;\n\ uint4 src1;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEUI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEUI_2DARRAY(src1, input1, coord);\n\ \n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ @@ -51034,8 +50646,8 @@ __kernel void maximum_I32I32toI32\n\ \n\ int4 src0;\n\ int4 src1;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ \n\ int4 dst = src0 > src1 ? src0 : src1;\n\ \n\ @@ -51084,8 +50696,8 @@ static const char minimum_cl[] = "__kernel void minimum_FP32FP32toFP32\n\ \n\ float4 src0;\n\ float4 src1;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEF_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ \n\ float4 dst = src0 < src1 ? src0 : src1;\n\ \n\ @@ -51132,8 +50744,8 @@ __kernel void minimum_U8U8toU8\n\ \n\ uint4 src0;\n\ uint4 src1;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEUI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEUI_2DARRAY(src1, input1, coord);\n\ \n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ @@ -51187,8 +50799,8 @@ __kernel void minimum_I32I32toI32\n\ \n\ int4 src0;\n\ int4 src1;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ \n\ int4 dst = src0 < src1 ? 
src0 : src1;\n\ \n\ @@ -52333,8 +51945,8 @@ static const char pow_cl[] = "__kernel void pow_FP32FP32toFP32\n\ \n\ float4 src0, src1;\n\ float4 dst;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEF_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ \n\ float4 s0 = sign(src0);\n\ int4 t0 = convert_int4(src1) & 1;\n\ @@ -52385,8 +51997,8 @@ static const char prelu_cl[] = "__kernel void prelu_FP32FP32toFP32\n\ \n\ float4 src0;\n\ float4 src1;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEF_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ \n\ float4 maxData = src0 >= 0 ? src0 : 0;\n\ float4 minData = src0 < 0 ? src0 : 0;\n\ @@ -52437,8 +52049,8 @@ __kernel void prelu_U8U8toU8\n\ \n\ uint4 src0;\n\ uint4 src1;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEUI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEUI_2DARRAY(src1, input1, coord);\n\ \n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ @@ -52500,8 +52112,8 @@ __kernel void prelu_I32I32toI32\n\ \n\ int4 src0;\n\ int4 src1;\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ \n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ @@ -52549,15 +52161,6 @@ __kernel void prelu_I32I32toI32_2D\n\ static const char random_multinomial_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ \n\ -inline uchar* get_image2D_array_ptr(image2d_array_t input)\n\ -{\n\ - int8 desc;\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ - uchar *src_ptr = (uchar*)desc.s0;\n\ -\n\ - return src_ptr;\n\ -}\n\ -\n\ uint4 _philox4x32bumpkey(uint4 key)\n\ {\n\ uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0);\n\ @@ -52610,14 +52213,16 @@ __kernel void random_seed(\n\ float re_rand_max\n\ )\n\ {\n\ - __global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds);\n\ + Tensor s_tensor = create_tensor_from_image2d_array(seeds, 4);\n\ + __global uint* seeds_ptr = (__global uint*)s_tensor.ptr;\n\ seeds_ptr = seeds_ptr;\n\ uint4 key = vload4(0, seeds_ptr);\n\ \n\ uint4 ctr = (uint4)(0);\n\ float4 result = 0;\n\ \n\ - __global float* output_ptr = (__global float*)get_image2D_array_ptr(output);\n\ + Tensor o_tensor = create_tensor_from_image2d_array(output, 4);\n\ + __global float* output_ptr = (__global float*)o_tensor.ptr;\n\ \n\ for(int i = 0; i < iter; i++)\n\ {\n\ @@ -52701,17 +52306,20 @@ __kernel void random_multinomial\n\ int class_size = get_image_width(cdfs);\n\ \n\ int offset = gidy * class_size;\n\ - __global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs);\n\ + Tensor cdf_tensor = create_tensor_from_image2d_array(cdfs, 4);\n\ + __global float* cdf_ptr = (__global uint*)cdf_tensor.ptr;\n\ __global float* cdfPtr = cdf_ptr + offset;\n\ \n\ int width = get_image_width(randoms);\n\ offset = coord.x + coord.y * width;\n\ - __global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms);\n\ + Tensor r_tensor = create_tensor_from_image2d_array(randoms, 4);\n\ + __global float* randoms_ptr = (__global float*)r_tensor.ptr;\n\ randoms_ptr = randoms_ptr + offset;\n\ \n\ width = get_image_width(output);\n\ 
offset = coord.x + coord.y * width;\n\ - __global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output);\n\ + Tensor o_tensor = create_tensor_from_image2d_array(output, 4);\n\ + __global uint* output_ptr = (__global uint*)o_tensor.ptr;\n\ output_ptr = output_ptr + offset;\n\ \n\ float4 ran = vload4(0, randoms_ptr);\n\ @@ -54108,8 +53716,8 @@ __kernel void func_name##_F32F32toBOOL8 \\\n\ \\\n\ float4 src0; \\\n\ float4 src1; \\\n\ - readImage2DArray(src0, input0, coord); \\\n\ - readImage2DArray(src1, input1, coord); \\\n\ + READ_IMAGEF_2DARRAY(src0, input0, coord); \\\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord); \\\n\ \\\n\ int4 dst = (src0)comp_op(src1); \\\n\ dst &= 1; \\\n\ @@ -54168,8 +53776,8 @@ __kernel void func_name##_U32U32toBOOL8 \\\n\ \\\n\ uint4 data0; \\\n\ uint4 data1; \\\n\ - readImage2DArray(data0, input0, coord); \\\n\ - readImage2DArray(data1, input1, coord); \\\n\ + READ_IMAGEUI_2DARRAY(data0, input0, coord); \\\n\ + READ_IMAGEUI_2DARRAY(data1, input1, coord); \\\n\ \\\n\ float4 src0 = convert_float4(data0) * input0Scale - input0Tail; \\\n\ float4 src1 = convert_float4(data1) * input1Scale - input1Tail; \\\n\ @@ -54232,8 +53840,8 @@ __kernel void func_name##_I32I32toBOOL8 \\\n\ \\\n\ int4 src0; \\\n\ int4 src1; \\\n\ - readImage2DArray(src0, input0, coord); \\\n\ - readImage2DArray(src1, input1, coord); \\\n\ + READ_IMAGEI_2DARRAY(src0, input0, coord); \\\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord); \\\n\ \\\n\ int4 dst = (src0)comp_op(src1); \\\n\ dst &= 1; \\\n\ @@ -55187,6 +54795,53 @@ __kernel void scatter_nd_F32toF32_3D(\n\ write_imagef(output, (int2)(gidx, gidy), sum);\n\ }"; /* end of scatter_nd_cl*/ +static const char scatter_nd_update_cl[] = "\n\ +#define SCATTER_ND_UPDATE(src0_type, data_type, read_func, write_func) \\\n\ +__kernel void scatter_nd_update_##src0_type##src0_type##to##src0_type( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __read_only image2d_t input2, \\\n\ + __write_only image2d_t output, \\\n\ + int offsetX, \\\n\ + int offsetY, \\\n\ + int offsetZ, \\\n\ + int offsetW, \\\n\ + int offset_idx, \\\n\ + int coord_dim, \\\n\ + int index_num \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int cnt = 0; \\\n\ + \\\n\ + data_type sum = (data_type)(0, 0, 0, 0); \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + for(int i = 0; i < index_num; i++) \\\n\ + { \\\n\ + int4 indice = vload4(0, index_ptr + offset_idx); \\\n\ + index_ptr += coord_dim; \\\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \\\n\ + if(gidy == idx) \\\n\ + { \\\n\ + data_type data = read_func(input2, (int2)(gidx, i)); \\\n\ + cnt++; \\\n\ + sum += data; \\\n\ + } \\\n\ + } \\\n\ + int2 coord = (int2)(gidx, gidy); \\\n\ + if(cnt == 0) \\\n\ + { \\\n\ + sum = read_func(input0, coord); \\\n\ + } \\\n\ + write_func(output, coord, sum); \\\n\ +}\n\ +SCATTER_ND_UPDATE(U32, uint4, read_imageui, write_imageui)\n\ +SCATTER_ND_UPDATE(I32, int4, read_imagei, write_imagei)\n\ +SCATTER_ND_UPDATE(F32, float4, read_imagef, write_imagef)\n\ +"; /* end of scatter_nd_update_cl*/ + static const char select_cl[] = "__kernel void select_I8_U8_U8toU8(\n\ __read_only image2d_array_t condition,\n\ __read_only image2d_array_t input0,\n\ @@ -55201,9 +54856,9 @@ static const char select_cl[] = "__kernel void select_I8_U8_U8toU8(\n\ int4 value;\n\ uint4 src0, src1, src, dst;\n\ float 
inputScale, inputTail;\n\ - readImage2DArray(value, condition, coord);\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEI_2DARRAY(value, condition, coord);\n\ + READ_IMAGEUI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEUI_2DARRAY(src1, input1, coord);\n\ src = (value != 0 ? src0 : src1);\n\ inputScale = (value.x != 0 ? input0Scale : input1Scale);\n\ inputTail = (value.x != 0 ? input0Tail : input1Tail);\n\ @@ -55245,9 +54900,9 @@ __kernel void select_I8_I32_I32toI32(\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 value;\n\ int4 src0, src1, dst;\n\ - readImage2DArray(value, condition, coord);\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEI_2DARRAY(value, condition, coord);\n\ + READ_IMAGEI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ dst = (value != 0 ? src0 : src1);\n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -55283,9 +54938,9 @@ __kernel void select_I8_F32_F32toF32(\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 value;\n\ float4 src0, src1, dst;\n\ - readImage2DArray(value, condition, coord);\n\ - readImage2DArray(src0, input0, coord);\n\ - readImage2DArray(src1, input1, coord);\n\ + READ_IMAGEI_2DARRAY(value, condition, coord);\n\ + READ_IMAGEF_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ dst = (value != 0 ? src0 : src1);\n\ write_imagef(output, coord, dst);\n\ }\n\ @@ -55382,6 +55037,29 @@ __kernel void sequence_mask_I32toF32_2D(\n\ write_imagef(output, coord, data);\n\ }"; /* end of sequence_mask_cl*/ +static const char signal_frame_cl[] = "\n\ +#define SIGNAL_FRAME_SH_IMPL(type, data_type, read_imagefunc, write_imagefunc) \\\n\ +__kernel void signal_frame_##type##to##type \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int frame_step \\\n\ + ) \\\n\ +{ \\\n\ + int inner = get_global_id(0); \\\n\ + int length_k = get_global_id(1); \\\n\ + int frames_id = get_global_id(2); \\\n\ + \\\n\ + int4 coord = (int4)(inner, length_k, frames_id, frames_id); \\\n\ + int2 coord_in = (int2)(inner, frames_id * frame_step + length_k); \\\n\ + \\\n\ + data_type src = read_imagefunc(input, coord_in); \\\n\ + write_imagefunc(output, coord, src); \\\n\ +}\n\ +SIGNAL_FRAME_SH_IMPL(F32, float4, read_imagef, write_imagef)\n\ +SIGNAL_FRAME_SH_IMPL(U8, uint4, read_imageui, write_imageui)\n\ +"; /* end of signal_frame_cl*/ + static const char slice_cl[] = "__kernel void slice_F32_I32toF32\n\ (\n\ __read_only image2d_array_t input0,\n\ @@ -55740,7 +55418,7 @@ __kernel void swish_I32toI32_2D(\n\ "; /* end of swish_cl*/ static const char tile_cl[] = "\n\ -#define TILE_3D(name0, name1, data_type, write_image_func) \\\n\ __kernel void tile_##name0##to##name1 \\\n\ ( \\\n\ __read_only image2d_array_t input, \\\n\ @@ -55760,7 +55438,7 @@ __kernel void tile_##name0##to##name1 \\\n\ int height = get_image_height(input); \\\n\ \\\n\ data_type src; \\\n\ - readImage2DArray(src, input, coord); \\\n\ + read_image_func(src, input, coord); \\\n\ \\\n\ int batch_id = (short)coord.z / (short)depthIn; \\\n\ coord.z = (short)coord.z % (short)depthIn; \\\n\ @@ -55787,11 +55465,11 @@ __kernel void tile_##name0##to##name1 \\\n\ } \\\n\ } \\\n\ }\n\ -TILE_3D(I32, I32, int4, write_imagei)\n\ -TILE_3D(U32, U32, uint4, write_imageui)\n\ 
-TILE_3D(F32, F32, float4, write_imagef)\n\ +TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei)\n\ +TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui)\n\ +TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef)\n\ \n\ -#define TILE_2D(name0, name1, data_type) \\\n\ +#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \\\n\ __kernel void tile_##name0##to##name1##_2D \\\n\ ( \\\n\ __read_only image2d_t input, \\\n\ @@ -55811,23 +55489,22 @@ __kernel void tile_##name0##to##name1##_2D \\\n\ int output_width = get_image_width(output); \\\n\ int output_height = get_image_height(output); \\\n\ \\\n\ - data_type src; \\\n\ - readImage(src, input, coord); \\\n\ + data_type src = read_image_func(input, coord); \\\n\ \\\n\ do \\\n\ { \\\n\ do \\\n\ { \\\n\ - writeImage(output, coord, src); \\\n\ + write_image_func(output, coord, src); \\\n\ coord.x += width; \\\n\ } while (coord.x < output_width); \\\n\ coord.x = get_global_id(0); \\\n\ coord.y += height; \\\n\ } while (coord.y < output_height); \\\n\ }\n\ -TILE_2D(I32, I32, int4)\n\ -TILE_2D(U32, U32, uint4)\n\ -TILE_2D(F32, F32, float4)\n\ +TILE_2D(I32, I32, int4, read_imagei, write_imagei)\n\ +TILE_2D(U32, U32, uint4, read_imageui, write_imageui)\n\ +TILE_2D(F32, F32, float4, read_imagef, write_imagef)\n\ \n\ \n\ \n\ @@ -56075,6 +55752,7 @@ static const source_map_t evis_resource[] = {"eltwise_unary_2d_vx", eltwise_unary_2d_vx}, {"eltwise_unary_3d_vx", eltwise_unary_3d_vx}, {"erf_vx", erf_vx}, + {"extra_ending_vx", extra_ending_vx}, {"floordiv_vx", floordiv_vx}, {"gather_vx", gather_vx}, {"gather_array_vx", gather_array_vx}, @@ -56085,6 +55763,7 @@ static const source_map_t evis_resource[] = {"gather_nd_3d_vx", gather_nd_3d_vx}, {"gather_nd_3d_mix_vx", gather_nd_3d_mix_vx}, {"gather_nd_mix_vx", gather_nd_mix_vx}, + {"get_matrix_vx", get_matrix_vx}, {"group_normalization_f16_vx", group_normalization_f16_vx}, {"group_normalization_i16_vx", group_normalization_i16_vx}, {"group_normalization_i8_vx", group_normalization_i8_vx}, @@ -56167,6 +55846,8 @@ static const source_map_t evis_resource[] = {"moments_axis012_vx", moments_axis012_vx}, {"moments_axis1_vx", moments_axis1_vx}, {"moments_axis2_vx", moments_axis2_vx}, + {"moments_u8_vx", moments_u8_vx}, + {"moments_u8_axis012_vx", moments_u8_axis012_vx}, {"one_hot_vx", one_hot_vx}, {"poolwithargmax_F16_vx", poolwithargmax_F16_vx}, {"poolwithargmax_I16_vx", poolwithargmax_I16_vx}, @@ -56236,11 +55917,16 @@ static const source_map_t evis_resource[] = {"resize_nearest_vx", resize_nearest_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, + {"scatter_nd_update_vx", scatter_nd_update_vx}, + {"scatter_nd_update_atom_vx", scatter_nd_update_atom_vx}, + {"scatter_nd_update_big_vx", scatter_nd_update_big_vx}, {"select_vx", select_vx}, {"sequence_mask_vx", sequence_mask_vx}, + {"signal_frame_vx", signal_frame_vx}, {"slice_vx", slice_vx}, {"space2depth_internal_vx", space2depth_internal_vx}, {"swish_vx", swish_vx}, + {"tensorstackconcat_vx", tensorstackconcat_vx}, {"tile_vx", tile_vx}, {"tile_mix_vx", tile_mix_vx}, {"upsample_F16_vx", upsample_F16_vx}, @@ -56249,22 +55935,8 @@ static const source_map_t evis_resource[] = {"upsample_U8_vx", upsample_U8_vx}, {"upsamplescale_vx", upsamplescale_vx}, {"upsamplescale_k2_vx", upsamplescale_k2_vx}, - {"vsi_nn_kernel_box_with_nms_limit_vx", vsi_nn_kernel_box_with_nms_limit_vx}, - {"vsi_nn_kernel_detection_postprocess_vx", vsi_nn_kernel_detection_postprocess_vx}, - {"vsi_nn_kernel_extra_ending_vx", 
vsi_nn_kernel_extra_ending_vx}, {"vsi_nn_kernel_header_vx", vsi_nn_kernel_header_vx}, - {"vsi_nn_kernel_heatmap_max_keypoint_vx", vsi_nn_kernel_heatmap_max_keypoint_vx}, - {"vsi_nn_kernel_imageprocess_vx", vsi_nn_kernel_imageprocess_vx}, - {"vsi_nn_kernel_imageprocess_2_vx", vsi_nn_kernel_imageprocess_2_vx}, - {"vsi_nn_kernel_imageprocess_3_vx", vsi_nn_kernel_imageprocess_3_vx}, - {"vsi_nn_kernel_imageprocess_4_vx", vsi_nn_kernel_imageprocess_4_vx}, - {"vsi_nn_kernel_imageprocess_5_vx", vsi_nn_kernel_imageprocess_5_vx}, - {"vsi_nn_kernel_roi_align_vx", vsi_nn_kernel_roi_align_vx}, - {"vsi_nn_kernel_signalframe_vx", vsi_nn_kernel_signalframe_vx}, - {"vsi_nn_kernel_tensorstackconcat_vx", vsi_nn_kernel_tensorstackconcat_vx}, - {"vsi_nn_kernel_transform_gemm_vx", vsi_nn_kernel_transform_gemm_vx}, - {"vsi_nn_kernel_transform_interp_vx", vsi_nn_kernel_transform_interp_vx}, - {"vsi_nn_kernel_transform_setupThres_vx", vsi_nn_kernel_transform_setupThres_vx}, + {"warp_affine_vx", warp_affine_vx}, }; static const source_map_t cl_resource[] = @@ -56366,8 +56038,10 @@ static const source_map_t cl_resource[] = {"resize_nearest_cl", resize_nearest_cl}, {"roi_align_cl", roi_align_cl}, {"scatter_nd_cl", scatter_nd_cl}, + {"scatter_nd_update_cl", scatter_nd_update_cl}, {"select_cl", select_cl}, {"sequence_mask_cl", sequence_mask_cl}, + {"signal_frame_cl", signal_frame_cl}, {"slice_cl", slice_cl}, {"space2depth_internal_cl", space2depth_internal_cl}, {"swish_cl", swish_cl}, diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index f79b691..14558a9 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -238,11 +238,11 @@ static vsi_status vsi_nn_RegisterVXKernel if(evis == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2"); + sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); } else { - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d", evis); + sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); } status = vxBuildProgram(program, cmd); @@ -323,11 +323,11 @@ static vsi_status vsi_nn_RegisterBinKernel if(evis == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2"); + sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); } else { - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d", evis); + sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); } #else sprintf(cmd, "-cl-viv-vx-extension"); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c index d376212..878c606 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c @@ -34,163 +34,10 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (6) #define _INPUT_NUM (3) #define _OUTPUT_NUM (4) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_BOX_WITH_NMS_LIMIT_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - 
vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_box_with_nms_limit_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.box_with_nms_limit); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_FLOAT32, score_threshold ); - _SET_PARAM( 1, VX_TYPE_INT32, max_num_bbox ); - _SET_PARAM( 2, VX_TYPE_INT32, nms_kernel_method ); - _SET_PARAM( 3, VX_TYPE_FLOAT32, iou_threshold ); - _SET_PARAM( 4, VX_TYPE_FLOAT32, sigma ); - _SET_PARAM( 5, VX_TYPE_FLOAT32, nms_score_threshold ); - #undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -199,46 +46,31 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_box_with_nms_limit_param *p = &self->nn_param.box_with_nms_limit; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_BOX_WITH_NMS_LIMIT_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_box_with_nms_limit"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + param = vsi_nn_kernel_param_create(); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + vsi_nn_kernel_param_add_float32( param, "score_threshold", p->score_threshold ); + vsi_nn_kernel_param_add_int32( param, "max_num_detections", p->max_num_bbox ); + vsi_nn_kernel_param_add_int32( param, "nms_kernel_method", p->nms_kernel_method ); + vsi_nn_kernel_param_add_float32( param, "iou_threshold", p->iou_threshold ); + vsi_nn_kernel_param_add_float32( param, "sigma", p->sigma ); + vsi_nn_kernel_param_add_float32( param, "nms_score_threshold", p->nms_score_threshold ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "box_with_nms_limit", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, param ); + + if( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } + vsi_nn_kernel_param_release( &param ); + return status; } /* op_compute() */ @@ -264,17 +96,27 @@ static vsi_bool op_setup { outputs[0]->attr.dim_num = 1; outputs[0]->attr.size[0] = inputs[0]->attr.size[1]; + } + if( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { outputs[1]->attr.dim_num = 2; outputs[1]->attr.size[0] = 4; outputs[1]->attr.size[1] = inputs[0]->attr.size[1]; - - outputs[2]->attr.dim_num = 1; - outputs[2]->attr.size[0] = inputs[0]->attr.size[1]; - - outputs[3]->attr.dim_num = 1; - outputs[3]->attr.size[0] = inputs[0]->attr.size[1]; } + + if( VSI_NN_DIM_AUTO == outputs[2]->attr.dim_num ) + { + outputs[2]->attr.dim_num = 1; + outputs[2]->attr.size[0] = inputs[0]->attr.size[1]; + } + + if( VSI_NN_DIM_AUTO == outputs[3]->attr.dim_num ) + { + outputs[3]->attr.dim_num = 1; + outputs[3]->attr.size[0] = inputs[0]->attr.size[1]; + } + return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index 34eb9cf..5189994 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -45,7 +45,7 @@ #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -static vsi_bool _is_same_quant +static vsi_bool 
_is_dataconvert_op ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -57,6 +57,15 @@ static vsi_bool _is_same_quant dtype = &inputs[0]->attr.dtype; _dtype = &outputs[0]->attr.dtype; + if ( dtype->qnt_type == VSI_NN_QNT_TYPE_NONE && + dtype->vx_type != VSI_NN_TYPE_FLOAT32 && + dtype->vx_type != VSI_NN_TYPE_FLOAT16 && + _dtype->qnt_type == VSI_NN_QNT_TYPE_NONE && + vsi_nn_OpCheck(VSI_NN_OP_DATACONVERT, self, inputs, outputs) ) + { + return TRUE; + } + if (vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) { return FALSE; @@ -74,9 +83,8 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; - if ( _is_same_quant(self, inputs, outputs)) + if ( _is_dataconvert_op(self, inputs, outputs) ) { - vsi_nn_internal_compute_node( self ); status = VSI_SUCCESS; } @@ -157,6 +165,9 @@ static vsi_bool op_check IO_TYPE(D_I16, D_I8|Q_DFP) IO_TYPE(D_I16, D_U8|Q_ASYM) IO_TYPE(D_I16, D_BOOL8) + IO_TYPE(D_I16, D_I32) + IO_TYPE(D_I16, D_U32) + IO_TYPE(D_I16, D_F32) IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) @@ -165,6 +176,9 @@ static vsi_bool op_check IO_TYPE(D_I8, D_I16|Q_DFP) IO_TYPE(D_I8, D_U8|Q_ASYM) IO_TYPE(D_I8, D_BOOL8) + IO_TYPE(D_I8, D_I32) + IO_TYPE(D_I8, D_U32) + IO_TYPE(D_I8, D_F32) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) @@ -173,6 +187,9 @@ static vsi_bool op_check IO_TYPE(D_U8, D_I16|Q_DFP) IO_TYPE(D_U8, D_I8|Q_DFP) IO_TYPE(D_U8, D_BOOL8) + IO_TYPE(D_U8, D_I32) + IO_TYPE(D_U8, D_U32) + IO_TYPE(D_U8, D_F32) IO_TYPE(D_F32, D_I16|Q_DFP) IO_TYPE(D_F32, D_I8|Q_DFP) IO_TYPE(D_F32, D_U8|Q_ASYM) @@ -224,7 +241,7 @@ static vsi_status op_optimize status = VSI_SUCCESS; - if ( _is_same_quant(self, inputs, outputs)) + if ( _is_dataconvert_op(self, inputs, outputs) ) { vsi_nn_internal_optimize_node( self, direction ); } @@ -248,7 +265,7 @@ static vsi_bool op_setup } ret = vsi_nn_op_common_setup(self, inputs, outputs); - if ( _is_same_quant(self, inputs, outputs) ) + if ( _is_dataconvert_op(self, inputs, outputs) ) { vsi_nn_internal_node_t* curr = NULL; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); @@ -304,4 +321,3 @@ DEF_OP_REG ); __END_DECLS - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index d36cf41..0f7abb0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -37,28 +37,6 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" -static vsi_bool _enable_concat_optimize() -{ - char *envctrl; - static int32_t enableOptimize = -1; - - if (enableOptimize == -1) - { - enableOptimize = 1; - envctrl = getenv("VSI_NN_ENABLE_CONCAT_OPTIMIZE"); - if (envctrl) - { - enableOptimize = atoi(envctrl); - } - } - - if (enableOptimize == 1) - { - return TRUE; - } - - return FALSE; -} static int32_t _get_input_num ( @@ -267,7 +245,7 @@ static vsi_status op_compute status = VSI_SUCCESS; self->n = NULL; if(_is_highest_dimension(self, outputs) && _is_same_quant(self, inputs, outputs) - && _enable_concat_optimize()) + && self->graph->ctx->options.enable_concat_optimize) { iter = self->nn_param.concat.lcl_data; while( NULL != iter ) @@ -422,7 +400,7 @@ static vsi_status op_optimize /* we don't create tensor view if the axis is not the highest dimension */ if (_is_highest_dimension(self, outputs) == FALSE || _is_same_quant(self, inputs, outputs) == FALSE || - _enable_concat_optimize() == FALSE) + self->graph->ctx->options.enable_concat_optimize == 0) { 
return status; } @@ -547,4 +525,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index 79631e3..c3317e0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -188,7 +188,8 @@ static vsi_bool op_check IO_TYPE(D_F16, D_I8) IO_TYPE(D_F16, D_U8) IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_I32|Q_DFP) + IO_TYPE(D_F32, D_I32|Q_ASYM) IO_TYPE(D_F32, D_U32) IO_TYPE(D_F32, D_F16) IO_TYPE(D_F32, D_BF16) @@ -292,7 +293,7 @@ static vsi_bool op_check { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); + VSILOGD("Inputs/Outputs data type not support: %s", desc); destroy_op_io_types_desc(desc); return FALSE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index d55ac92..408d87f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -59,8 +59,17 @@ static vsi_status _eltwise_unary_op_compute // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - kernel_name, inputs, 1, outputs, 1, param ); + if (strcmp(kernel_name, "gelu") == 0 && self->nn_param.gelu.approximate) + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "hard_gelu", inputs, 1, outputs, 1, param ); + } + else + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, inputs, 1, outputs, 1, param ); + } + if( self->n ) { @@ -197,6 +206,7 @@ DEF_ELEMENT_WISE_UNARY_OP( NEG, neg ); DEF_ELEMENT_WISE_UNARY_OP( HARD_SIGMOID, hard_sigmoid ); DEF_ELEMENT_WISE_UNARY_OP( MISH, mish ); DEF_ELEMENT_WISE_UNARY_OP( ROUND, round ); +DEF_ELEMENT_WISE_UNARY_OP( GELU, gelu ); #undef DEF_ELEMENT_UNARY_WISE_OP diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c index 009e75d..23be09a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c @@ -34,154 +34,12 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -extern vx_kernel_description_t * vx_kernel_EXTRA_ENDING_list[]; - -static void check_tensor_shape - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input, - vx_reference * params, - uint32_t index, - vx_bool rsFlg - ) -{ - vsi_nn_tensor_attr_t attr; - - if (index == 0) - { - if( input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.extra_ending.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.extra_ending.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - } - else if (index == 1) - { - params[index] = (vx_reference)input->t; - } - else if (index == 2) - { - if( input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - 
self->nn_param.extra_ending.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.extra_ending.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - } - else - { - VSILOGE("No more local tensor!(pow) at [%s : %d]\n", __FILE__, __LINE__); - } -} - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_tensor_t* extraInput - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[3]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - check_tensor_shape(self, inputs[0], params, 0, 0); - check_tensor_shape(self, extraInput, params, 1, 0); - check_tensor_shape(self, outputs[0], params, 2, 0); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, 3 ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_tensor_t* extraInput - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[3]; - vx_border_t border; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - check_tensor_shape(self, inputs[0], params, 0, 0); - check_tensor_shape(self, extraInput, params, 1, 0); - check_tensor_shape(self, outputs[0], params, 2, 0); - /*TODO: Add code if need to change your parameter*/ - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, 3 ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; - - if (outputDataFormat == VSI_NN_TYPE_INT16 || outputDataFormat == VSI_NN_TYPE_FLOAT16) - { - kernel_info->kernel_index = 1; - } - if (outputDataFormat == VSI_NN_TYPE_INT8) - { - kernel_info->kernel_index = 2; - } - if (outputDataFormat == VSI_NN_TYPE_UINT8) - { - kernel_info->kernel_index = 3; - } - else - { - VSILOGE("Not support input or output data format!(extra ending) at [%s : %d]\n", __FILE__, __LINE__); - return VSI_FAILURE; - } - - return VSI_SUCCESS; -} - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -189,69 +47,36 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - vsi_nn_tensor_t* tmpRealInput = NULL; + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t *tmp_tensor = NULL; + vsi_nn_tensor_t *input_tensor[2] = {NULL}; + vsi_nn_extra_ending_param *p = &self->nn_param.extra_ending; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_EXTRA_ENDING_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_extra_ending"; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = 2; + attr.size[0] = p->length; + attr.size[1] = 1; + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.vtl = FALSE; + tmp_tensor = vsi_nn_CreateTensorFromData(self->graph, + 
(uint8_t*)&p->value, &attr); + input_tensor[0] = inputs[0]; + input_tensor[1] = tmp_tensor; + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "extra_ending", + input_tensor, 2, + outputs, 1, NULL ); + + if( self->n ) { - vsi_nn_tensor_attr_t attr; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - attr.size[0] = self->nn_param.extra_ending.length; - attr.size[1] = 1; - attr.size[2] = 1; - attr.size[3] = 1; - attr.dim_num = 2; - attr.dtype.vx_type = VSI_NN_TYPE_UINT8; - attr.vtl = FALSE; - tmpRealInput = vsi_nn_CreateTensorFromData(self->graph, - (uint8_t*)&self->nn_param.extra_ending.value, &attr); + status = VSI_SUCCESS; } - if( kernel_info.type == VX_KERNEL_TYPE_VX) - { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - vx_op_pre_compute(self, inputs, outputs, &kernel_info); - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; - kernel_info.type = VX_KERNEL_TYPE_CPU; - } + vsi_safe_release_tensor(tmp_tensor); - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - status = VSI_FAILURE; - goto final; - } - - if(kernel_info.type == VX_KERNEL_TYPE_VX) - { - status = vx_op_compute(self, inputs, outputs, tmpRealInput); - } - else - { - status = cpu_op_compute(self, inputs, outputs, tmpRealInput); - } - -final: - if(tmpRealInput) vsi_nn_ReleaseTensor(&tmpRealInput); return status; } /* op_compute() */ @@ -263,10 +88,10 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(EXTRA_ENDING, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) @@ -275,10 +100,10 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) END_IO_TYPE_DECL(EXTRA_ENDING) if (!VALIDATE_OP_IO_TYPES(EXTRA_ENDING, self, inputs, self->input.num, outputs, self->output.num)) { @@ -312,15 +137,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - uint32_t i; - for (i = 0; i < _VSI_NN_EXTRA_ENDING_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.extra_ending.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.extra_ending.local.local_tensor[i])); - self->nn_param.extra_ending.local.local_tensor[i] = NULL; - } - } vsi_nn_op_common_deinit(self); return VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c index 0a01f72..46ee1d2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c @@ -34,158 +34,10 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (1) #define _INPUT_NUM (2) #define _OUTPUT_NUM (2) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - 
-extern vx_kernel_description_t * vx_kernel_HEATMAP_MAX_KEYPOINT_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_heatmap_max_keypoint_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.heatmap_max_keypoint); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, type ); - #undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -194,48 +46,20 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_HEATMAP_MAX_KEYPOINT_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_heatmap_max_keypoint"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "heatmap_max_keypoint", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, NULL ); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + if( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } return status; -} /* op_compute() */ +} static vsi_bool op_check ( @@ -255,16 +79,21 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = 2; outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; outputs[0]->attr.size[1] = inputs[0]->attr.size[3]; + } + + if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { outputs[1]->attr.dim_num = 3; outputs[1]->attr.size[0] = 2; outputs[1]->attr.size[1] = inputs[0]->attr.size[0]; outputs[1]->attr.size[2] = inputs[0]->attr.size[3]; } + return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c index 8883c35..d9b3b32 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c @@ -37,83 +37,8 @@ #include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (14) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_IMAGEPROCESS_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_imageprocess_param * p; - int32_t 
i; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.imageprocess); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, crop.enable ); - _SET_PARAM( 1, VX_TYPE_INT32, crop.dim_num ); - for (i = 0; i < p->crop.dim_num; i++) - { - _SET_PARAM( 2 + i, VX_TYPE_INT32, crop.start[i] ); - } - _SET_PARAM( 6, VX_TYPE_BOOL, reverse_channel ); - _SET_PARAM( 7, VX_TYPE_INT32, mean.type ); - _SET_PARAM( 8, VX_TYPE_FLOAT32, mean.scale ); - _SET_PARAM( 9, VX_TYPE_INT32, mean.mean_value_size ); - for (i = 0; i < p->mean.mean_value_size; i++) - { - _SET_PARAM( 10 + i, VX_TYPE_FLOAT32, mean.mean_value[i] ); - } -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ struct _scaletotensor_kernel_params { @@ -125,511 +50,6 @@ struct _scaletotensor_kernel_params typedef struct _scaletotensor_kernel_params scaletotensor_kernel_params_t; -static vsi_status prepare_params_scaletotensor - ( - vsi_nn_imageprocess_param *p, - scaletotensor_kernel_params_t *params, - vsi_nn_tensor_attr_t *attr_in, - vsi_nn_tensor_attr_t *attr_out - ) -{ - int32_t i; - if (p->crop.enable == TRUE) - { - params->offset[0] = p->crop.start[0]; - params->offset[1] = p->crop.start[1]; - } - else - { - params->offset[0] = 0; - params->offset[1] = 0; - } - - if (p->crop.enable == TRUE) - { - params->ratio[0] = (p->crop.length[0] << 15) / attr_out->size[0]; - params->ratio[1] = (p->crop.length[1] << 15) / attr_out->size[1]; - } - else - { - params->ratio[0] = (attr_in->size[0] << 15) / attr_out->size[0]; - params->ratio[1] = (attr_in->size[1] << 15) / attr_out->size[1]; - } - - if (p->mean.type == VSI_NN_IMAGEPROCESS_MEAN_NONE) - { - for (i = 0; i < 3; i++) - { - params->mean[i] = 0; - } - } - else if (p->mean.type == VSI_NN_IMAGEPROCESS_MEAN_CHANNEL) - { - for (i = 0; i < 3; i++) - { - params->mean[i] = p->mean.mean_value[i]; - } - } - else if (p->mean.type == VSI_NN_IMAGEPROCESS_MEAN_PIXEL) - { - for (i = 0; i < 3; i++) - { - params->mean[i] = p->mean.mean_value[0]; - } - } - params->scale = p->mean.scale; - return VSI_SUCCESS; -} - -static vsi_status prepare_params_grayscaletotensor - ( - vsi_nn_imageprocess_param *p, - scaletotensor_kernel_params_t *params, - vsi_nn_tensor_attr_t *attr_in, - vsi_nn_tensor_attr_t *attr_out - ) -{ - if (p->crop.enable == TRUE) - { - params->offset[0] = p->crop.start[0]; - params->offset[1] = p->crop.start[1]; - } - else - { - params->offset[0] = 0; - params->offset[1] = 0; - } - - if (p->crop.enable == TRUE) - { - params->ratio[0] = (p->crop.length[0] << 15) / attr_out->size[0]; - params->ratio[1] = (p->crop.length[1] << 15) / attr_out->size[1]; - } - else - { - params->ratio[0] = (attr_in->size[0] << 15) / attr_out->size[0]; - params->ratio[1] = (attr_in->size[1] << 15) / attr_out->size[1]; - } - - if (p->mean.type == VSI_NN_IMAGEPROCESS_MEAN_NONE) - { - params->mean[0] = 0; - } - else - { - params->mean[0] = p->mean.mean_value[0]; - } - params->scale = p->mean.scale; - return VSI_SUCCESS; -} - -static vsi_status _create_params_vx_scaletotensor - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num, - vsi_nn_tensor_attr_t *attr_in, - vsi_nn_tensor_attr_t *attr_out - ) -{ - vsi_status status; - vx_context ctx; - 
vsi_nn_imageprocess_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.imageprocess); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - { - scaletotensor_kernel_params_t scaletotensor_kernel_params; - prepare_params_scaletotensor(p, &scaletotensor_kernel_params, attr_in, attr_out); - _SET_PARAM( 0, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[0]); - _SET_PARAM( 1, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[1]); - _SET_PARAM( 2, VX_TYPE_INT32, scaletotensor_kernel_params.offset[0]); - _SET_PARAM( 3, VX_TYPE_INT32, scaletotensor_kernel_params.offset[1]); - _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); - _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[1]); - _SET_PARAM( 6, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[2]); - _SET_PARAM( 7, VX_TYPE_FLOAT32, scaletotensor_kernel_params.scale); - } -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params_vx_scaletotensor */ - -static vsi_status _create_params_vx_grayscaletotensor - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num, - vsi_nn_tensor_attr_t *attr_in, - vsi_nn_tensor_attr_t *attr_out - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_imageprocess_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.imageprocess); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - { - scaletotensor_kernel_params_t scaletotensor_kernel_params; - prepare_params_grayscaletotensor(p, &scaletotensor_kernel_params, attr_in, attr_out); - _SET_PARAM( 0, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[0]); - _SET_PARAM( 1, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[1]); - _SET_PARAM( 2, VX_TYPE_INT32, scaletotensor_kernel_params.offset[0]); - _SET_PARAM( 3, VX_TYPE_INT32, scaletotensor_kernel_params.offset[1]); - _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); - _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.scale); - } -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params_vx_scaletotensor */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status select_kernel_index - ( - vsi_nn_kernel_info_t * kernel_info, - vsi_nn_type_e outDataType, - vx_bool is_copy - ) -{ - if (!is_copy) - { - if (outDataType == VSI_NN_TYPE_FLOAT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess"; - kernel_info->kernel_index = 1; - } - else if (outDataType == VSI_NN_TYPE_INT8) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess"; - kernel_info->kernel_index = 2; - } - else if (outDataType == VSI_NN_TYPE_INT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_2"; - kernel_info->kernel_index = 3; - } - else if (outDataType == VSI_NN_TYPE_UINT8) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_2"; - kernel_info->kernel_index = 4; - } - else - { - VSILOGE("Unsupported data type(imageprocess).\n"); - return VSI_FAILURE; - } - } - else - { - if (outDataType == VSI_NN_TYPE_FLOAT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_3"; - kernel_info->kernel_index = 5; - } - else if (outDataType == VSI_NN_TYPE_INT8) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_3"; - kernel_info->kernel_index = 6; - } - else if (outDataType == VSI_NN_TYPE_INT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_3"; - kernel_info->kernel_index = 7; - } - else if (outDataType == VSI_NN_TYPE_UINT8) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_3"; - kernel_info->kernel_index = 8; - } - else - { - VSILOGE("Unsupported data type(imageprocess).\n"); - return VSI_FAILURE; - } - } - - return VSI_SUCCESS; -} - -static vsi_status select_kernel_index_gray - ( - vsi_nn_kernel_info_t * kernel_info, - vsi_nn_type_e outDataType, - vx_bool is_copy - ) -{ - if (!is_copy) - { - if (outDataType == VSI_NN_TYPE_FLOAT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_4"; - kernel_info->kernel_index = 9; - } - else if (outDataType == VSI_NN_TYPE_INT8) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_4"; - kernel_info->kernel_index = 10; - } - else if (outDataType == VSI_NN_TYPE_INT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; - kernel_info->kernel_index = 11; - } - else if (outDataType == VSI_NN_TYPE_UINT8) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; - kernel_info->kernel_index = 12; - } - else - { - VSILOGE("Unsupported data type(imageprocess).\n"); - return VSI_FAILURE; - } - } - else - { - if (outDataType == VSI_NN_TYPE_FLOAT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; - kernel_info->kernel_index = 13; - } - else if (outDataType == VSI_NN_TYPE_INT8) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; - kernel_info->kernel_index = 14; - } - else if (outDataType == VSI_NN_TYPE_INT16) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; - kernel_info->kernel_index = 15; - } - else if (outDataType == VSI_NN_TYPE_UINT8) - { - kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; - kernel_info->kernel_index = 16; - } - else - { - VSILOGE("Unsupported data type(imageprocess).\n"); - return VSI_FAILURE; - } - } - - return VSI_SUCCESS; -} - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e outDataType = 
outputs[0]->attr.dtype.vx_type; - vx_bool is_copy = (vx_bool)((inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) - && (inputs[0]->attr.size[1] == outputs[0]->attr.size[1])); - - if (inputs[0]->attr.size[2] == 1) - { - kernel_info->init_index = 2; - return select_kernel_index_gray(kernel_info, outDataType, is_copy); - } - else - { - kernel_info->init_index = 1; - return select_kernel_index(kernel_info, outDataType, is_copy); - } -} - -#define _ARG_NUM_SCALETOTENSOR (8) -#define _PARAM_NUM_SCALETOTENSOR (_ARG_NUM_SCALETOTENSOR + _IO_NUM) - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM_SCALETOTENSOR]; - vx_border_t border; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params_vx_scaletotensor( self, args, _ARG_NUM_SCALETOTENSOR, - &(inputs[0]->attr), &(outputs[0]->attr)); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM_SCALETOTENSOR ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - _release_params( args, _ARG_NUM_SCALETOTENSOR ); - - return status; -} /* vx_op_compute() */ - -#define _ARG_NUM_GRAYSCALETOTENSOR (6) -#define _PARAM_NUM_GRAYSCALETOTENSOR (_ARG_NUM_GRAYSCALETOTENSOR + _IO_NUM) - -static vsi_status vx_gray_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM_GRAYSCALETOTENSOR]; - vx_border_t border; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params_vx_grayscaletotensor( self, args, _ARG_NUM_GRAYSCALETOTENSOR, - &(inputs[0]->attr), &(outputs[0]->attr)); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM_GRAYSCALETOTENSOR ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - _release_params( args, _ARG_NUM_GRAYSCALETOTENSOR ); - - return status; -} /* vx_gray_op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - BEGIN_IO_TYPE_DECL(IMAGEPROCESS, 1, 1) - IO_TYPE(D_U8, D_F16) - IO_TYPE(D_U8, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_I16|Q_DFP) - IO_TYPE(D_U8, D_I8|Q_DFP) - END_IO_TYPE_DECL(IMAGEPROCESS) - if (!VALIDATE_OP_IO_TYPES(IMAGEPROCESS, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - return TRUE; -} /* op_check() */ - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - vx_gray_op_compute, - NULL -}; static vsi_status op_compute ( @@ -638,33 +58,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; + vsi_status status = VSI_SUCCESS; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_imageprocess"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_IMAGEPROCESS_list; - - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) - { - vx_op_pre_compute(self, inputs, outputs, &kernel_info); - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } return status; } /* op_compute() */ @@ -720,398 +115,6 @@ static vsi_bool op_setup return TRUE; } /* op_setup() */ -typedef struct _vsi_nn_image_data_t -{ - int32_t id; - vx_image handle; -}vsi_nn_image_data_t; - -typedef struct _vsi_nn_image_list_t -{ - vsi_nn_link_list_t link_list; - vsi_nn_image_data_t image; -} vsi_nn_image_list_t; - -static void _init_image_list(vsi_nn_link_list_t *node) -{ - vsi_nn_image_list_t *image_list = (vsi_nn_image_list_t *)node; - image_list->link_list.next = NULL; - image_list->link_list.prev = NULL; - memset(&image_list->image, 0, sizeof(vsi_nn_image_data_t)); -} - -static vsi_nn_image_list_t* get_image_by_id -( - vsi_nn_image_list_t* head, - int32_t id -) -{ - vsi_nn_image_list_t *iter; - iter = head; - while(iter) - { - if (iter->image.id == id) - { - return iter; - } - iter = (vsi_nn_image_list_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); - } - iter = (vsi_nn_image_list_t *)vsi_nn_LinkListNewNode( - sizeof(vsi_nn_image_list_t), _init_image_list); - iter->image.id = id; - return iter; -} - -vsi_nn_image_list_t* images_head = NULL; -// pipeline: -// 1.crop -// 2.resize -// 3.(val-mean)*scale -// 4.RGBRGBRGB ---> BBBGGGRRR -// 5.revert channel: BBBGGGRRR ---> RRRGGGBBB -vsi_status vsi_nn_InsertImageprocessSingleNode - ( - vsi_nn_graph_t *graph, - vsi_nn_tensor_attr_t *attr, - vsi_nn_imageprocess_param *p, - uint8_t 
*data, - vsi_nn_tensor_t *tensor_out, - int32_t id - ) -{ - vsi_nn_image_list_t* p_image; - vx_image image_global; - if(images_head == NULL) - { - images_head = (vsi_nn_image_list_t *)vsi_nn_LinkListNewNode( - sizeof(vsi_nn_image_list_t), _init_image_list); - } - p_image = get_image_by_id(images_head, id); - image_global = p_image->image.handle; - if(image_global == NULL) - { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - vx_node node = NULL; - vx_reference params[_PARAM_NUM_SCALETOTENSOR]; - vx_border_t border; - vx_reference * args; - vx_image image = NULL; - vx_context ctx = vxGetContext( (vx_reference)graph->g ); - vx_imagepatch_addressing_t imgInfo; - vx_bool is_copy = (vx_bool)((attr->size[0] == tensor_out->attr.size[0]) - && (attr->size[1] == tensor_out->attr.size[1])); - vsi_nn_tensor_t *tensor_temp = NULL; - vsi_nn_tensor_t *output_scaletotensor = NULL; - vsi_nn_tensor_t *output_reversetensor = NULL; - vx_nn_tensor_reverse_params_t para; - int32_t reverse1_axis[4] = {2}; - uint32_t perm[] = {2, 0, 1, 3}; - vsi_nn_tensor_t out0; - uint32_t arg_num; - vx_bool is_gray = FALSE; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - memset(&out0, 0, sizeof(vsi_nn_tensor_t)); - para.axis = reverse1_axis; - para.numberOfAxis = 1; - - if (p->platform_type == VSI_NN_PLATFORM_TENSORFLOW) - { - vsi_nn_tensor_attr_t attr0; - memcpy(&attr0, &tensor_out->attr, sizeof(vsi_nn_tensor_attr_t)); - attr0.size[0] = tensor_out->attr.size[1]; - attr0.size[1] = tensor_out->attr.size[2]; - attr0.size[2] = tensor_out->attr.size[0]; - - if (attr0.size[2] == 1) - { - is_gray= TRUE; - p->reverse_channel = FALSE; - } - is_copy = (vx_bool)((attr->size[0] == attr0.size[0]) - && (attr->size[1] == attr0.size[1])); - if (!is_gray) - { - output_scaletotensor = vsi_nn_CreateTensor(graph, &attr0); - if (p->reverse_channel == TRUE) - { - output_reversetensor = vsi_nn_CreateTensor(graph, &attr0); - } - tensor_temp = output_scaletotensor; - } - else - { - out0.t = vxReshapeTensor(tensor_out->t, (int32_t *)attr0.size, attr0.dim_num); - memcpy(&out0.attr, &attr0, sizeof(vsi_nn_tensor_attr_t)); - tensor_temp = &out0; - } - } - else /* VSI_NN_PLATFORM_CAFFE */ - { - if (tensor_out->attr.size[2] == 1) - { - is_gray= TRUE; - p->reverse_channel = FALSE; - } - - if (p->reverse_channel == TRUE) - { - output_scaletotensor = vsi_nn_CreateTensor(graph, &(tensor_out->attr)); - tensor_temp = output_scaletotensor; - } - else - { - tensor_temp = tensor_out; - } - } - - args = ¶ms[_IO_NUM]; - - status = VSI_FAILURE; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_IMAGEPROCESS_list; - if (!is_gray) - { - kernel_info.init_index = 1; - status = select_kernel_index(&kernel_info, tensor_out->attr.dtype.vx_type, is_copy); - } - else - { - kernel_info.init_index = 2; - status = select_kernel_index_gray(&kernel_info, tensor_out->attr.dtype.vx_type, is_copy); - } - - node = vsi_nn_RegisterClientKernelAndNewNode( - graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == node ) - { - VSILOGE("Create scaletotensor node fails"); - status = VSI_FAILURE; - goto OnError; - } - //imgInfo = {width * num_of_channels, height, 1, width * num_of_channels, VX_SCALE_UNITY, VX_SCALE_UNITY, 1, 1}; - imgInfo.dim_x = attr->size[0] * attr->size[2]; - imgInfo.dim_y = attr->size[1]; - imgInfo.stride_x = 1; - imgInfo.stride_y = imgInfo.dim_x; - 
imgInfo.scale_x = VX_SCALE_UNITY; - imgInfo.scale_y = VX_SCALE_UNITY; - imgInfo.step_x = 1; - imgInfo.step_y = 1; - -#if defined(__linux__) - image = vxCreateImageFromHandle(ctx, VX_DF_IMAGE_U8, &imgInfo, (void **)&data, VX_MEMORY_TYPE_HOST); -#else - image = vxCreateImage(ctx, imgInfo.dim_x, imgInfo.dim_y, VX_DF_IMAGE_U8); - { - vx_rectangle_t rect = {0, 0, 0, 0}; - vx_map_id map_id = 0; - void* imgBaseAddr = NULL; - - rect.end_x = imgInfo.dim_x; - rect.end_y = imgInfo.dim_y; - vxMapImagePatch(image, &rect, 0,&map_id, &imgInfo, &imgBaseAddr, VX_WRITE_ONLY,VX_MEMORY_TYPE_HOST, 0);// get data pointer of image in GPU side - memcpy((vx_uint8*)imgBaseAddr, data, imgInfo.dim_x * imgInfo.dim_y); - vxUnmapImagePatch(image, map_id); - imgBaseAddr = NULL; - } -#endif - image_global = image; - p_image->image.handle = image; - - /* Set inputs and outputs */ - params[0] = (vx_reference)image; - params[1] = (vx_reference)tensor_temp->t; - - /* Init parameters. */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i + _IO_NUM] = (vx_reference)vxCreateScalar( ctx, type, &arg ); \ - status = vxGetStatus( params[i + _IO_NUM] ); \ - if( VSI_SUCCESS != status ) { \ - status = VSI_FAILURE;\ - goto OnError;\ - } \ - } while(0) - if (!is_gray) - { - { - scaletotensor_kernel_params_t scaletotensor_kernel_params; - prepare_params_scaletotensor(p, &scaletotensor_kernel_params, attr, &(tensor_temp->attr)); - _SET_PARAM( 0, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[0]); - _SET_PARAM( 1, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[1]); - _SET_PARAM( 2, VX_TYPE_INT32, scaletotensor_kernel_params.offset[0]); - _SET_PARAM( 3, VX_TYPE_INT32, scaletotensor_kernel_params.offset[1]); - if (p->reverse_channel == TRUE) - { - _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); - _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[1]); - _SET_PARAM( 6, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[2]); - } - else - { - _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[2]); - _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[1]); - _SET_PARAM( 6, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); - } - _SET_PARAM( 7, VX_TYPE_FLOAT32, scaletotensor_kernel_params.scale); - } - arg_num = _ARG_NUM_SCALETOTENSOR; - } - else - { - { - scaletotensor_kernel_params_t scaletotensor_kernel_params; - prepare_params_scaletotensor(p, &scaletotensor_kernel_params, attr, &(tensor_temp->attr)); - _SET_PARAM( 0, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[0]); - _SET_PARAM( 1, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[1]); - _SET_PARAM( 2, VX_TYPE_INT32, scaletotensor_kernel_params.offset[0]); - _SET_PARAM( 3, VX_TYPE_INT32, scaletotensor_kernel_params.offset[1]); - _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); - _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.scale); - } - arg_num = _ARG_NUM_GRAYSCALETOTENSOR; - } -#undef _SET_PARAM - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters(node, params, _IO_NUM + arg_num); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(node, VX_NODE_BORDER, &border, sizeof(border)); - - _release_params( args, arg_num); - - if (p->platform_type == VSI_NN_PLATFORM_TENSORFLOW) - { - if (p->reverse_channel == TRUE) - { - node = vxTensorReverse( graph->g, output_scaletotensor->t, ¶, - sizeof(vx_nn_tensor_reverse_params_t), output_reversetensor->t ); - if( NULL == node ) - { - VSILOGE("Create vxTensorReverse node fails"); - status = VSI_FAILURE; - goto OnError; - } - - node = vxTensorPermuteNode( graph->g, output_reversetensor->t, - tensor_out->t, perm, 4); - if( NULL == node ) - { - VSILOGE("Create vxTensorPermuteNode node fails"); - status = VSI_FAILURE; - goto OnError; - } - } - else - { - if (!is_gray) - { - node = vxTensorPermuteNode( graph->g, output_scaletotensor->t, - tensor_out->t, perm, 4); - if( NULL == node ) - { - VSILOGE("Create vxTensorPermuteNode node fails"); - status = VSI_FAILURE; - goto OnError; - } - } - else - { - if (out0.t) vxReleaseTensor(&out0.t); - } - } - } - else /* VSI_NN_PLATFORM_CAFFE */ - { - if (p->reverse_channel == TRUE) - { - node = vxTensorReverse( graph->g, output_scaletotensor->t, ¶, - sizeof(vx_nn_tensor_reverse_params_t), tensor_out->t ); - if( NULL == node ) - { - VSILOGE("Create vxTensorReverse node fails"); - status = VSI_FAILURE; - goto OnError; - } - } - } - - //set graph inputs outputs again, because pre_process changed graph inputs - { - uint32_t num_of_graph_inputs; - vx_reference *graph_inputs = NULL; - uint32_t num_of_graph_outputs; - vx_reference *graph_outputs = NULL; - uint32_t i = 0; - - /* Explicitly set graph inputs and outputs */ - num_of_graph_inputs = 1; - graph_inputs = (vx_reference *)malloc( num_of_graph_inputs * sizeof( vx_reference ) ); - - graph_inputs[0] = (vx_reference)image; - - num_of_graph_outputs = graph->output.num; - graph_outputs = (vx_reference *)malloc( num_of_graph_outputs * sizeof( vx_reference ) ); - for( i = 0; i < num_of_graph_outputs; i++ ) - { - graph_outputs[i] = (vx_reference)( ( vsi_nn_GetTensor( graph, graph->output.tensors[i] ) )->t ); - } - status = vxIdentifyGraphInputsAndOutputs( graph->g, - num_of_graph_inputs, - graph_inputs, - num_of_graph_outputs, - graph_outputs ); - - if ( NULL != graph_inputs) - { - free( graph_inputs ); - } - if ( NULL != graph_outputs) - { - free( graph_outputs ); - } - } -OnError: - //if(tensor_temp) vsi_nn_ReleaseTensor(&tensor_temp); - if(output_scaletotensor) vsi_nn_ReleaseTensor(&output_scaletotensor); - if(output_reversetensor) vsi_nn_ReleaseTensor(&output_reversetensor); - return status; - } - else - { -#if !defined(__linux__) - vx_imagepatch_addressing_t imgInfo; - vx_rectangle_t rect = {0, 0, 0, 0}; - vx_map_id map_id = 0; - void* imgBaseAddr = NULL; - - //imgInfo = {width * num_of_channels, height, 1, width * num_of_channels, VX_SCALE_UNITY, VX_SCALE_UNITY, 1, 1}; - imgInfo.dim_x = attr->size[0] * attr->size[2]; - imgInfo.dim_y = attr->size[1]; - imgInfo.stride_x = 1; - imgInfo.stride_y = imgInfo.dim_x; - imgInfo.scale_x = VX_SCALE_UNITY; - imgInfo.scale_y = VX_SCALE_UNITY; - imgInfo.step_x = 1; - imgInfo.step_y = 1; - - rect.end_x = imgInfo.dim_x; - rect.end_y = imgInfo.dim_y; - vxMapImagePatch(image_global, &rect, 0,&map_id, &imgInfo, &imgBaseAddr, VX_WRITE_ONLY,VX_MEMORY_TYPE_HOST, 0);// get data pointer of image in GPU side - memcpy((vx_uint8*)imgBaseAddr, data, imgInfo.dim_x * imgInfo.dim_y); - 
vxUnmapImagePatch(image_global, map_id); - imgBaseAddr = NULL; -#endif - return VSI_SUCCESS; - } -} - vsi_status vsi_nn_op_imageprocess_single_node ( vsi_nn_graph_t *graph, @@ -1121,19 +124,11 @@ vsi_status vsi_nn_op_imageprocess_single_node vsi_nn_tensor_t *tensor_out ) { - return vsi_nn_InsertImageprocessSingleNode( - graph, attr, p, data, tensor_out, 0); -} - -static void _release_image_list(vsi_nn_link_list_t *node) -{ - vsi_nn_image_list_t *image_list = (vsi_nn_image_list_t *)node; - vxReleaseImage(&(image_list->image.handle)); + return VSI_SUCCESS; } vsi_status vsi_nn_ReleaseImageprocessSingleNode() { - vsi_nn_LinkListDeinit((vsi_nn_link_list_t *)images_head, _release_image_list); return VSI_SUCCESS; } @@ -1147,7 +142,7 @@ DEF_OP_REG /* init */ NULL, /* compute */ op_compute, /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, + /* check */ NULL, /* setup */ op_setup, /* optimize */ NULL, /* input_num */ _INPUT_NUM, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index 4057e45..b102bfd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -155,65 +155,6 @@ static vsi_bool _check_value_is_equal_to_one return ret; } -static vsi_bool _tensor_data_convert - ( - vsi_nn_graph_t* graph, - vsi_nn_tensor_t* in_tensor, - vsi_nn_tensor_t* out_tensor - ) -{ - vsi_bool ret = TRUE; - float* tensor_data = NULL; - uint32_t size = 0; - uint32_t stride[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint8_t* data = NULL; - - tensor_data = vsi_nn_ConvertTensorToFloat32Data( graph, in_tensor ); - if ( NULL == tensor_data ) - { - VSILOGE( "Convert data fail." ); - return FALSE; - } - - size = vsi_nn_GetStrideSize( &out_tensor->attr, stride ); - data = (uint8_t *)malloc( size ); - - if ( data ) - { - uint32_t i = 0; - uint32_t elements = size / stride[0]; - vsi_status status = VSI_SUCCESS; - - for ( i = 0; i < elements; i ++ ) - { - status = vsi_nn_Float32ToDtype( tensor_data[i], &data[stride[0] * i], &out_tensor->attr.dtype ); - if( VSI_FAILURE == status ) - { - VSILOGE("Convert default_value to dtype fail"); - break; - } - } - - status = vsi_nn_CopyDataToTensor( graph, out_tensor, data ); - free( data ); - data = NULL; - if ( VSI_FAILURE == status ) - { - VSILOGE("Copy data to tensor fail"); - } - } - - if ( !in_tensor->attr.is_created_from_handle ) - { - if ( tensor_data ) - { - free(tensor_data); - } - } - - return ret; -} - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -430,7 +371,7 @@ static vsi_bool op_setup attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; } reshape_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - _tensor_data_convert(self->graph, inputs[1], reshape_tensor->t); + vsi_nn_ConvertTensor(self->graph, inputs[1], reshape_tensor->t); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0); curr->inputs[0] = output_tensor->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index be46f09..7e19c43 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -53,6 +53,13 @@ static vsi_status op_compute vsi_nn_kernel_node_t n = NULL; float eps = self->nn_param.layernorm.eps; + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 && + inputs[2]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 ) + { + return 
vsi_nn_internal_compute_node( self ); + } + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); @@ -72,6 +79,71 @@ static vsi_status op_compute return status; } /* op_compute() */ +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_internal_node_t* curr = NULL; + + if ( NULL == self ) + { + return FALSE; + } + + vsi_nn_internal_init_node_wksp( self ); + + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 && + inputs[2]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 ) + { + vsi_nn_internal_tensor_t* mean_tensor = NULL; + vsi_nn_internal_tensor_t* vari_tensor = NULL; + vsi_nn_tensor_attr_t attr; + int32_t *axis = NULL; + + memcpy( &attr, &inputs[0]->attr, sizeof( attr ) ); + attr.size[0] = 1; + attr.vtl = TRUE; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + mean_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + vari_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MOMENTS, 0, 0); + axis = (int32_t*)\ + vsi_nn_internal_new_node_param(curr, sizeof(int32_t) * 4); + axis[0] = 0; + + curr->node->nn_param.moments.axis = axis; + curr->node->nn_param.moments.axis_num = 1; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = mean_tensor->t; + curr->outputs[1] = vari_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_BATCHNORM_SINGLE, 0, 0); + curr->inputs[0] = inputs[0]; + curr->inputs[1] = mean_tensor->t; + curr->inputs[2] = vari_tensor->t; + curr->inputs[3] = inputs[2]; + curr->inputs[4] = inputs[1]; + curr->node->nn_param.batchnorm_single.eps = self->nn_param.layernorm.eps; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + } + else + { + ret = vsi_nn_op_common_setup(self, inputs, outputs); + } + + return ret; +} + static vsi_bool op_check ( vsi_nn_node_t * self, @@ -84,6 +156,7 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F32, D_F16, D_F16) IO_TYPE(D_F16, D_F32, D_F32, D_F16) IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) @@ -108,7 +181,7 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - uint32_t i; + uint32_t i = 0; for (i = 0; i < _VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM; i++) { if (self->nn_param.layernorm.local.local_tensor[i] != NULL) @@ -117,6 +190,9 @@ static vsi_status op_deinit self->nn_param.layernorm.local.local_tensor[i] = NULL; } } + + vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -133,7 +209,7 @@ DEF_OP_REG /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, - /* setup */ vsi_nn_op_common_setup, + /* setup */ op_setup, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index f57eddb..8e2f1e6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -39,6 +39,8 @@ #include "ops/vsi_nn_op_lstmunit_ovxlib.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" +#include 
"utils/vsi_nn_dtype_util.h" + static vsi_nn_internal_tensor_t* create_tp_fc ( @@ -62,7 +64,7 @@ static vsi_nn_internal_tensor_t* create_tp_fc if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) { /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE); tensor = tensor1->t; } @@ -108,7 +110,7 @@ static vsi_nn_internal_tensor_t* create_nn_fc if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) { /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE); tensor = tensor1->t; } @@ -283,7 +285,6 @@ static vsi_bool op_setup vsi_bool is_recurrent_fc_on_tp = FALSE; vsi_nn_internal_tensor_t* input_tensor = NULL; vsi_nn_internal_tensor_t* output_tensor = NULL; - vsi_nn_internal_tensor_t* tmp_tensor = NULL; vsi_nn_internal_tensor_t* recurrent_input_tensor = NULL; vsi_nn_internal_tensor_t* input_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; vsi_nn_internal_tensor_t* aux_input_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; @@ -602,10 +603,30 @@ static vsi_bool op_setup if( p->local->use_projection ) { - if( p->local->use_hybrid || !p->local->use_projection_bias ) + if ( p->local->use_hybrid && p->local->use_projection_bias ) + { + vsi_bool use_virtual_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]->attr.vtl; + input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, + &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, FALSE); + zero_bias_tensor = input_tensor->t; + + if (use_virtual_tensor) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = inputs[LSTMUNIT_INPUT_BIAS_PROJ]; + curr->outputs[0] = zero_bias_tensor; + + vsi_nn_internal_setup_node(self, curr); + } + else + { + vsi_nn_ConvertTensor(self->graph, inputs[1], zero_bias_tensor); + } + } + else if ( p->local->use_hybrid || !p->local->use_projection_bias ) { input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, - &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr); + &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, FALSE); zero_bias_tensor = input_tensor->t; } else @@ -634,19 +655,8 @@ static vsi_bool op_setup curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; } - tmp_tensor = output_tensor; - vsi_nn_internal_setup_node(self, curr); - if( p->local->use_hybrid && p->local->use_projection_bias ) - { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); - curr->inputs[0] = tmp_tensor->t; - curr->inputs[1] = inputs[LSTMUNIT_INPUT_BIAS_PROJ]; - curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; - vsi_nn_internal_setup_node(self, curr); - } - /* copy h_state to output */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); curr->inputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c index fbcdd0b..2a5a5db 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c @@ -113,6 +113,7 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_F16, D_F16, D_F16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_F32, D_F32) diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c index b5e8f4f..3c8a57d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c @@ -110,8 +110,8 @@ static vsi_bool op_setup if ( VSI_NN_DIM_AUTO == outputs[2]->attr.dim_num ) { - outputs[0]->attr.dim_num = 1; - outputs[0]->attr.size[0] = 1; + outputs[2]->attr.dim_num = 1; + outputs[2]->attr.size[0] = 1; } return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c index 07f074e..5464b4c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c @@ -240,11 +240,6 @@ static vsi_bool op_check return FALSE; } - if ( vsi_nn_compareVersion(self->graph, 1, 1, 20) >= 0 ) - { - vsi_nn_OpCheck( VSI_NN_OP_MULTIPLY, self, inputs, outputs ); - } - return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index cc07d0a..6c7bda0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -1028,9 +1028,8 @@ static vsi_bool op_setup int index = 0; if (valid_dim_num == 0) { - outputs[0]->attr.dim_num = 2; + outputs[0]->attr.dim_num = 1; outputs[0]->attr.size[0] = 1; - outputs[0]->attr.size[1] = 1; } else { @@ -1043,11 +1042,6 @@ static vsi_bool op_setup index++; } } - if (1 == outputs[0]->attr.dim_num) - { - outputs[0]->attr.dim_num = 2; - outputs[0]->attr.size[1] = 1; - } } } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c new file mode 100644 index 0000000..190b04c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c @@ -0,0 +1,179 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + uint32_t i = 0; + uint32_t block_size = 1, coord_dim = 1; + uint32_t idx_num = 1; + uint32_t *input_size = inputs[2]->attr.size; + uint32_t dims_num = inputs[2]->attr.dim_num; + + if (inputs[1]->attr.dim_num > 1) + { + coord_dim = inputs[1]->attr.size[0]; + } + if ( coord_dim > 4 && input_size[dims_num - 1] > 1) + { + CHECK_STATUS(status); + return status; + } + for(i = 0; i < inputs[1]->attr.dim_num; i++) + { + idx_num *= inputs[1]->attr.size[i]; + } + idx_num /= coord_dim; + + param =vsi_nn_kernel_param_create(); + + for(i = 0; i < dims_num; ++i) + { + block_size *= input_size[i]; + } + block_size /= idx_num; + + vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); + vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim ); + vsi_nn_kernel_param_add_int32( param, "idx_num", idx_num ); + n = vsi_nn_kernel_selector( self->graph, "scatter_nd_update", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(SCATTER_ND_UPDATE, 3, 1) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I32, D_F16, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16, D_BF16) + IO_TYPE(D_I32, D_I32, D_I32, D_I32) + IO_TYPE(D_U32, D_I32, D_U32, D_U32) + IO_TYPE(D_F32, D_I32, D_F32, D_F32) + END_IO_TYPE_DECL(SCATTER_ND_UPDATE) + if (!VALIDATE_OP_IO_TYPES(SCATTER_ND_UPDATE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + uint32_t i = 0; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SCATTER_ND_UPDATE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_signal_frame.c b/src/tim/vx/internal/src/ops/vsi_nn_op_signal_frame.c new file mode 100644 index 0000000..4cbe3f0 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_signal_frame.c @@ -0,0 +1,179 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_signalframe_param *p = &self->nn_param.signalframe; + float pad_value = p->pad_value; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 33) == -1) + { + pad_value = (float)p->pad; + } + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "frame_length", p->window_length ); + vsi_nn_kernel_param_add_int32( param, "frame_step", p->step ); + vsi_nn_kernel_param_add_int32( param, "axis", p->axis ); + vsi_nn_kernel_param_add_int32( param, "pad_end", p->pad_end ); + vsi_nn_kernel_param_add_float32( param, "pad_val", pad_value ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "signal_frame", + inputs, 1, + outputs, 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(SIGNAL_FRAME, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + END_IO_TYPE_DECL(SIGNAL_FRAME) + if (!VALIDATE_OP_IO_TYPES(SIGNAL_FRAME, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i = 0; + vsi_bool ret = 0; + uint32_t axis = 0; + uint32_t num_frames = 0; + uint32_t frame_axis = 0; + uint32_t frame_step = 0; + uint32_t frame_length = 0; + vsi_nn_signalframe_param *p = &self->nn_param.signalframe; + + ret = TRUE; + if( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) + { + return ret; + } + + axis = p->axis; + if(axis >= inputs[0]->attr.dim_num) + { + return FALSE; + } + + /* signal frame will increase dim num */ + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + 1; + for (i = 0; i < axis; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + + frame_step = p->step; + frame_length = p->window_length; + frame_axis = inputs[0]->attr.size[axis]; + num_frames = p->pad_end ? 
+ (frame_axis + frame_step - 1) / frame_step : (frame_axis - frame_length ) / frame_step + 1; + + outputs[0]->attr.size[axis] = frame_length; + outputs[0]->attr.size[axis + 1] = num_frames; + + for (i = axis + 1; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i + 1] = inputs[0]->attr.size[i]; + } + + return ret; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SIGNAL_FRAME, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c b/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c deleted file mode 100644 index 432970b..0000000 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c +++ /dev/null @@ -1,677 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include - -#include "vsi_nn_types.h" -#include "vsi_nn_platform.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_node.h" -#include "utils/vsi_nn_math.h" -#include "vsi_nn_ops.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "utils/vsi_nn_constraint_check.h" - -#define _ARG_NUM (5) -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -#define ENABLE_CPU 0 -#define TENSOR_ALL 0 - -extern vx_kernel_description_t * vx_kernel_SIGNALFRAME_list[]; - -static vsi_status _create_local_tensor - ( - vsi_nn_node_t * self - ) -{ - //vsi_nn_tensor_t *signal_tensor = NULL; - //vsi_nn_tensor_t *frame_tensor = NULL; - vsi_nn_tensor_t *window_length_tensor = NULL; - vsi_nn_tensor_t *step_tensor = NULL; - vsi_nn_tensor_t *pad_end_tensor = NULL; - vsi_nn_tensor_t *pad_tensor = NULL; - vsi_nn_tensor_t *axis_tensor = NULL; - - if(NULL == self) - { - return VSI_FAILURE; - } - - window_length_tensor = vsi_nn_VariableToTensor(self, - (uint8_t *)&self->nn_param.signalframe.window_length, - VSI_NN_TYPE_UINT32); - if(NULL == window_length_tensor) - { - goto error; - } - - step_tensor = vsi_nn_VariableToTensor(self, - (uint8_t *)&self->nn_param.signalframe.step, - VSI_NN_TYPE_UINT32); - if(NULL == step_tensor) - { - goto error; - } - - pad_end_tensor = vsi_nn_VariableToTensor(self, - (uint8_t *)&self->nn_param.signalframe.pad_end, - VSI_NN_TYPE_UINT32); - if(NULL == pad_end_tensor) - { - goto error; - } - - pad_tensor = vsi_nn_VariableToTensor(self, - (uint8_t *)&self->nn_param.signalframe.pad, - VSI_NN_TYPE_UINT32); - if(NULL == pad_tensor) - { - goto error; - } - - axis_tensor = vsi_nn_VariableToTensor(self, - (uint8_t *)&self->nn_param.signalframe.axis, - VSI_NN_TYPE_UINT32); - if(NULL == axis_tensor) - { - goto error; - } - - self->nn_param.signalframe.local.window_length_tensor = window_length_tensor; - self->nn_param.signalframe.local.step_tensor = step_tensor; - self->nn_param.signalframe.local.pad_end_tensor = pad_end_tensor; - self->nn_param.signalframe.local.pad_tensor = pad_tensor; - self->nn_param.signalframe.local.axis_tensor = axis_tensor; - - return VSI_SUCCESS; -error: - if(window_length_tensor)vsi_nn_ReleaseTensor(&window_length_tensor); - if(step_tensor)vsi_nn_ReleaseTensor(&step_tensor); - if(pad_end_tensor)vsi_nn_ReleaseTensor(&pad_end_tensor); - if(pad_tensor)vsi_nn_ReleaseTensor(&pad_tensor); - if(axis_tensor)vsi_nn_ReleaseTensor(&axis_tensor); - return VSI_FAILURE; -} /* _create_local_tensor() */ - -static void check_tensor_shape - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input, - vx_reference * params, - uint32_t index, - vx_bool rsFlg - ) -{ - vsi_nn_tensor_attr_t attr; - - if (index == 0 ) - { - if( input->attr.dim_num == 1 ) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.signalframe.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - } - else if(index == 1 ) - { - if(input->attr.dim_num == 1) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - 
self->nn_param.signalframe.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; - } - else if(input->attr.dim_num == 4) - { - memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[2] *= attr.size[3]; - attr.size[3] = 1; - attr.dim_num = 3; - self->nn_param.signalframe.local.local_tensor[index] = - vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; - } - else - params[index] = (vx_reference)input->t; - - } - else - { - VSILOGE("No more local tensor!(signalframe) at [%s : %d]\n", __FILE__, __LINE__); - } -} - -static void check_local_tensor_shape - ( - vsi_nn_node_t * self, - vx_reference * params, - uint32_t index, - vx_bool rsFlg - ) -{ - vsi_nn_tensor_attr_t attr; - - if( self->nn_param.signalframe.local.window_length_tensor->attr.dim_num == 1 ) - { - memcpy(&attr, &(self->nn_param.signalframe.local.window_length_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.signalframe.local.local_tensor[index] = - vxReshapeTensor(self->nn_param.signalframe.local.window_length_tensor->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; - } - else - params[index] = (vx_reference)self->nn_param.signalframe.local.window_length_tensor->t; - index++; - - if( self->nn_param.signalframe.local.step_tensor->attr.dim_num == 1 ) - { - memcpy(&attr, &(self->nn_param.signalframe.local.step_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.signalframe.local.local_tensor[index] = - vxReshapeTensor(self->nn_param.signalframe.local.step_tensor->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; - } - else - params[index] = (vx_reference)self->nn_param.signalframe.local.step_tensor->t; - index++; - - if( self->nn_param.signalframe.local.pad_end_tensor->attr.dim_num == 1 ) - { - memcpy(&attr, &(self->nn_param.signalframe.local.pad_end_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.signalframe.local.local_tensor[index] = - vxReshapeTensor(self->nn_param.signalframe.local.pad_end_tensor->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; - } - else - params[index] = (vx_reference)self->nn_param.signalframe.local.pad_end_tensor->t; - index++; - - if( self->nn_param.signalframe.local.pad_tensor->attr.dim_num == 1 ) - { - memcpy(&attr, &(self->nn_param.signalframe.local.pad_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.signalframe.local.local_tensor[index] = - vxReshapeTensor(self->nn_param.signalframe.local.pad_tensor->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; - } - else - params[index] = (vx_reference)self->nn_param.signalframe.local.pad_tensor->t; - index++; - - if( self->nn_param.signalframe.local.axis_tensor->attr.dim_num == 1 ) - { - memcpy(&attr, &(self->nn_param.signalframe.local.axis_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[1] = 1; - attr.dim_num = 2; - self->nn_param.signalframe.local.local_tensor[index] = - 
vxReshapeTensor(self->nn_param.signalframe.local.axis_tensor->t, (int32_t*)(attr.size), attr.dim_num); - params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; - } - else - params[index] = (vx_reference)self->nn_param.signalframe.local.axis_tensor->t; - -} - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_signalframe_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.signalframe); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_UINT32, window_length ); - _SET_PARAM( 1, VX_TYPE_UINT32, step ); - _SET_PARAM( 2, VX_TYPE_UINT32, pad_end ); - _SET_PARAM( 3, VX_TYPE_UINT32, pad ); - _SET_PARAM( 4, VX_TYPE_UINT32, axis ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -#if ENABLE_CPU -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } - - /*for( i = 0; i < _ARG_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i + 1]->t; - }*/ -} /* _set_inputs_outputs() */ -#endif - -#if ENABLE_CPU -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_bool rsFlg = FALSE; - vx_reference * args; - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - //_set_inputs_outputs( params, inputs, outputs ); - check_tensor_shape(self, inputs[0], params, 0, rsFlg); - check_tensor_shape(self, outputs[0], params, 1, rsFlg); - if(TENSOR_ALL) - check_local_tensor_shape(self, params, 2, rsFlg); - else - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} -#endif - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; - uint32_t axis0 = self->nn_param.signalframe.axis; - uint32_t axis = axis0; - uint32_t dim = inputs[0]->attr.dim_num; - vx_bool dataTypeFlg = FALSE; - vx_bool etFlg = FALSE; - - if((inputDataFormat == VSI_NN_TYPE_INT8 && outputDataFormat == VSI_NN_TYPE_INT8) || - (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8)) - etFlg = TRUE; - - if ((inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16) || - (inputDataFormat == VSI_NN_TYPE_INT16 && outputDataFormat == VSI_NN_TYPE_INT16) || - etFlg) - dataTypeFlg = TRUE; - - axis = dim - axis0 - 1; - - if (dataTypeFlg - && ((dim == 1 && axis==0) || (dim == 2 && axis==1) || (dim == 3 && axis==2))) - { - kernel_info->kernel_index = 1; - if(etFlg) - { - kernel_info->kernel_index = 4; - } - } - else if(dataTypeFlg - && ((dim == 2 && axis==0) || (dim == 3 && axis==1))) - { - kernel_info->kernel_index = 2; - if(etFlg) - { - kernel_info->kernel_index = 5; - } - } - else if(dataTypeFlg - && (dim == 3 && axis==0)) - { - kernel_info->kernel_index = 3; - if(etFlg) - { - kernel_info->kernel_index = 6; - } - } - else - { - VSILOGE("Not support input or output data format!(SIGNALFRAME) at [%s : %d]\n", __FILE__, __LINE__); - return VSI_FAILURE; - } - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - vx_bool rsFlg = FALSE; - vx_reference * args; - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - //_set_inputs_outputs( params, inputs, outputs ); - check_tensor_shape(self, inputs[0], params, 0, rsFlg); - check_tensor_shape(self, outputs[0], params, 1, rsFlg); - if(TENSOR_ALL) - check_local_tensor_shape(self, params, 2, rsFlg); - else - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status |= vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U32 = 0; - border.constant_value.S16 = 0; - border.constant_value.U8 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - status = VSI_SUCCESS; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - if(0) - { - status = _create_local_tensor(self); - if(status != VSI_SUCCESS) - { - return status; - } - } - -#if ENABLE_CPU //cpu - { - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_signalframe"; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_SIGNALFRAME_list; - kernel_info.init_index = 0; - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - status = cpu_op_compute(self, inputs, outputs); - - return status; - } -#endif - - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_signalframe"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_SIGNALFRAME_list; - kernel_info.init_index = 1; - - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) - { - vx_op_pre_compute(self, inputs, outputs, &kernel_info); - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - status |= vx_op_compute(self, inputs, outputs); - - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - BEGIN_IO_TYPE_DECL(SIGNAL_FRAME, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32) - END_IO_TYPE_DECL(SIGNAL_FRAME) - if(!VALIDATE_OP_IO_TYPES(SIGNAL_FRAME, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - return TRUE; -} /* op_check() */ - -static vsi_status op_deinit - ( - vsi_nn_node_t * self - ) -{ - uint32_t i; - for (i = 0; i < _VSI_NN_SIGNALFRAME_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.signalframe.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.signalframe.local.local_tensor[i])); - self->nn_param.signalframe.local.local_tensor[i] = NULL; - } - } - - if(self->nn_param.signalframe.local.window_length_tensor) - vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.window_length_tensor); - if(self->nn_param.signalframe.local.step_tensor) - vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.step_tensor); - if(self->nn_param.signalframe.local.pad_end_tensor) - vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.pad_end_tensor); - 
if(self->nn_param.signalframe.local.pad_tensor) - vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.pad_tensor); - if(self->nn_param.signalframe.local.axis_tensor) - vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.axis_tensor); - vsi_nn_op_common_deinit(self); - - return VSI_SUCCESS; -} /* op_deinit() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - vsi_bool ret; - uint32_t axis; - - ret = TRUE; - if( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) - { - return ret; - } - - axis = self->nn_param.signalframe.axis; - if(axis >= inputs[0]->attr.dim_num) - { - return FALSE; - } - - /* signal frame will increase dim num */ - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + 1; - for(i = 0; i < axis; i++) - { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; - } - if(self->nn_param.signalframe.pad_end) - { - outputs[0]->attr.size[axis] = inputs[0]->attr.size[axis]; - } - else - { - if(inputs[0]->attr.size[axis] >= self->nn_param.signalframe.window_length) - { - outputs[0]->attr.size[axis] = (inputs[0]->attr.size[axis] - self->nn_param.signalframe.window_length) \ - / self->nn_param.signalframe.step + 1; - } - else - { - outputs[0]->attr.size[axis] = 0; - return FALSE; - } - } - for(i = axis; i < inputs[0]->attr.dim_num; i++) - { - outputs[0]->attr.size[i + 1] = inputs[0]->attr.size[i]; - } - - return ret; -} /* op_setup() */ - -#ifdef __cplusplus -extern "C" { -#endif -/* Registrar */ -DEF_OP_REG - ( - /* op_name */ SIGNAL_FRAME, - /* init */ NULL, - /* compute */ op_compute, - /* deinit */ op_deinit, - /* check */ op_check, - /* setup */ op_setup, - /* optimize */ NULL, - /* input_num */ 1, - /* output_num */ 1 - ); -#ifdef __cplusplus -} -#endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c b/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c index c514fbf..4213be7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c @@ -34,474 +34,12 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (2) #define _INPUT_NUM (2) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define _VSI_PARAM (vsi_nn_spatial_transformer_param) - -extern vx_kernel_description_t * vx_kernel_SPATIAL_TRANSFORMER_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for ( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for ( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status = VSI_SUCCESS; - vx_context ctx; - vsi_nn_spatial_transformer_param * p; - int32_t flag = 0; - vsi_nn_tensor_t * thre_tensor; - vsi_nn_tensor_attr_t attr; - - uint16_t value_buf[6] = {0}; - - if ( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = (vsi_nn_spatial_transformer_param 
*)node->nn_param.client_param; - ctx = vxGetContext( (vx_reference)node->graph->g ); - - flag = ((p->has_theta_1_1 == 1) - | ((p->has_theta_1_2 == 1) << 1) - | ((p->has_theta_1_3 == 1) << 2) - | ((p->has_theta_2_1 == 1) << 3) - | ((p->has_theta_2_2 == 1) << 4) - | ((p->has_theta_2_3 == 1) << 5)); - - params[0] = (vx_reference)vxCreateScalar( ctx, VSI_NN_TYPE_INT32, &flag ); - - memset( &attr, 0, sizeof( vsi_nn_tensor_attr_t ) ); - attr.size[0] = 6; - attr.size[1] = 1; - attr.size[2] = 1; - attr.size[3] = 1; - attr.dim_num = 4; - attr.is_const = TRUE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - - vsi_nn_Float32ToDtype(p->theta_1_1, (uint8_t*)(&value_buf[0]), &attr.dtype); - vsi_nn_Float32ToDtype(p->theta_1_2, (uint8_t*)(&value_buf[1]), &attr.dtype); - vsi_nn_Float32ToDtype(p->theta_1_3, (uint8_t*)(&value_buf[2]), &attr.dtype); - vsi_nn_Float32ToDtype(p->theta_2_1, (uint8_t*)(&value_buf[3]), &attr.dtype); - vsi_nn_Float32ToDtype(p->theta_2_2, (uint8_t*)(&value_buf[4]), &attr.dtype); - vsi_nn_Float32ToDtype(p->theta_2_3, (uint8_t*)(&value_buf[5]), &attr.dtype); - - thre_tensor = vsi_nn_CreateTensorFromData( node->graph,(uint8_t *)&value_buf, &attr ); - - params[1] = (vx_reference)thre_tensor->t; - p->lcl.local_tensor = thre_tensor; - p->lcl.scl = (vx_scalar)params[0]; -#if 0 - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VSI_NN_TYPE_FLOAT32, has_theta_1_3 ); - _SET_PARAM( 1, VSI_NN_TYPE_FLOAT32, has_theta_2_1 ); - _SET_PARAM( 2, VSI_NN_TYPE_FLOAT32, has_theta_1_2 ); - _SET_PARAM( 3, VSI_NN_TYPE_FLOAT32, theta_2_1 ); - _SET_PARAM( 4, VSI_NN_TYPE_FLOAT32, has_output_W ); - _SET_PARAM( 5, VSI_NN_TYPE_INT32, output_W ); - _SET_PARAM( 6, VSI_NN_TYPE_FLOAT32, theta_1_3 ); - _SET_PARAM( 7, VSI_NN_TYPE_FLOAT32, theta_2_2 ); - _SET_PARAM( 8, VSI_NN_TYPE_FLOAT32, theta_1_2 ); - _SET_PARAM( 9, VSI_NN_TYPE_INT32, output_H ); - _SET_PARAM( 10, VSI_NN_TYPE_FLOAT32, has_theta_2_3 ); - _SET_PARAM( 11, VSI_NN_TYPE_FLOAT32, theta_2_3 ); - _SET_PARAM( 12, VSI_NN_TYPE_FLOAT32, has_theta_2_2 ); - _SET_PARAM( 13, VSI_NN_TYPE_FLOAT32, has_output_H ); - _SET_PARAM( 14, VSI_NN_TYPE_FLOAT32, has_theta_1_1 ); - _SET_PARAM( 15, VSI_NN_TYPE_FLOAT32, theta_1_1 ); - #undef _SET_PARAM -#endif -//set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_nn_spatial_transformer_param * p = NULL; - - p = (vsi_nn_spatial_transformer_param *)node->nn_param.client_param; - - if (p->lcl.local_tensor) vsi_nn_ReleaseTensor(&p->lcl.local_tensor); - if (p->lcl.scl) vxReleaseScalar(&p->lcl.scl); -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( self, args, _ARG_NUM ); - - return status; -} - -vsi_status setUPGridData(uint32_t output_W_, uint32_t output_H_, float scale, int32_t zeropoint, - vsi_nn_dtype_t data_type, vsi_nn_qnt_type_e qnt_type, uint8_t fp, int16_t *tensorData) -{ - vsi_status status = VSI_SUCCESS; - uint32_t x = 0; - uint32_t y = 0; - uint32_t idx = 0; - float *tmp_buf = NULL; - uint32_t i = 0; - vsi_nn_dtype_t dtype; - - dtype.vx_type = VSI_NN_TYPE_FLOAT16; - dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.fl = 0; - dtype.scale = 1; - dtype.zero_point = 0; - - tmp_buf = (float*) malloc(output_W_ * output_H_ * 3 * sizeof(float)); - if ( tmp_buf == NULL ) - { - return VX_FAILURE; - } - for (y = 0; y < output_H_; y++ ) - { - for (x = 0; x < output_W_; x++) - { - float data0 = y * (float)1.0 / (float)output_H_ * 2 - 1; - float data1 = x * (float)1.0 / (float)output_W_ * 2 - 1; - float data2 = 1; - - tmp_buf[idx++] = data0; - tmp_buf[idx++] = data1; - tmp_buf[idx++] = data2; - } - } - - for (i = 0; i < output_H_ * output_W_ * 3; i++) - { - vsi_nn_Float32ToDtype( tmp_buf[i], (uint8_t*)&tensorData[i], &dtype ); - } - - if (tmp_buf) - { - free(tmp_buf); - tmp_buf = NULL; - } - return status; -} - -static vsi_status vx_op_compute_setupThre - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[4] = {NULL}; - //vx_reference * args; - vsi_nn_spatial_transformer_param * p = NULL; - int flag = 0; - vsi_nn_tensor_t * thre_tensor = NULL; - vsi_nn_tensor_attr_t attr; - vx_context ctx = NULL; - vx_scalar flag_s = NULL; - vx_tensor tmp_t = NULL, tmp_t1 = NULL; - - //float flag_buf[6]; - vx_uint16 value_buf[6]; - - memset( params, 0, sizeof( vx_reference * ) * 4 ); - p = (vsi_nn_spatial_transformer_param *)self->nn_param.client_param; - ctx = vxGetContext( (vx_reference)self->graph->g ); - - flag = ((p->has_theta_1_1 == 1) - | ((p->has_theta_1_2 == 1) << 1) - | ((p->has_theta_1_3 == 1) << 2) - | ((p->has_theta_2_1 == 1) << 3) - | ((p->has_theta_2_2 == 1) << 4) - | ((p->has_theta_2_3 == 1) << 5)); - - memset( &attr, 0, sizeof( vsi_nn_tensor_attr_t ) ); - attr.size[0] = 6; - attr.size[1] = 1; - attr.size[2] = 1; - attr.size[3] = 1; - attr.dim_num = 4; - attr.is_const = TRUE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.fl = 0; - attr.dtype.scale = 1; - attr.dtype.zero_point = 0; - attr.vtl = FALSE; - vsi_nn_Float32ToDtype( p->theta_1_1, (uint8_t*)(&value_buf[0]), &attr.dtype ); - vsi_nn_Float32ToDtype( p->theta_1_2, (uint8_t*)(&value_buf[1]), &attr.dtype ); - vsi_nn_Float32ToDtype( p->theta_1_3, (uint8_t*)(&value_buf[2]), &attr.dtype ); - vsi_nn_Float32ToDtype( p->theta_2_1, (uint8_t*)(&value_buf[3]), &attr.dtype ); - vsi_nn_Float32ToDtype( p->theta_2_2, (uint8_t*)(&value_buf[4]), &attr.dtype ); - vsi_nn_Float32ToDtype( p->theta_2_3, (uint8_t*)(&value_buf[5]), &attr.dtype ); - - thre_tensor = vsi_nn_CreateTensorFromData( self->graph,(uint8_t *)&value_buf, &attr ); - - if ( NULL == self->n ) - { - status = VSI_FAILURE; - if (thre_tensor) - { - vsi_nn_ReleaseTensor( &thre_tensor); - thre_tensor = NULL; - } - return status; - } - - flag_s = vxCreateScalar( ctx, VSI_NN_TYPE_INT32, &flag ); - - params[0] = (vx_reference)thre_tensor->t; - - attr.size[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; - attr.size[1] = 1; - attr.size[2] = inputs[0]->attr.size[2]; - attr.size[3] = 
inputs[0]->attr.size[3]; - attr.dim_num = inputs[0]->attr.dim_num; - - tmp_t = vxReshapeTensor( inputs[0]->t, (vx_int32*)attr.size, attr.dim_num ); - - params[1] = (vx_reference)tmp_t; - params[2] = (vx_reference)flag_s; - - attr.size[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; - attr.size[1] = 1; - attr.size[2] = outputs[0]->attr.size[2]; - attr.size[3] = outputs[0]->attr.size[3]; - attr.dim_num = outputs[0]->attr.dim_num; - - tmp_t1 = vxReshapeTensor( outputs[0]->t, (vx_int32*)attr.size, attr.dim_num ); - - params[3] = (vx_reference)tmp_t1; - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, 4 ); - - //_release_params( args, 4 ); - if (thre_tensor) - { - vsi_nn_ReleaseTensor( &thre_tensor); - thre_tensor = NULL; - } - if (tmp_t) - { - vxReleaseTensor( &tmp_t ); - tmp_t = NULL; - } - if (tmp_t1) - { - vxReleaseTensor( &tmp_t1 ); - tmp_t1 = NULL; - } - if (flag_s) - { - vxReleaseScalar( &flag_s ); - flag_s = NULL; - } - - return status; -} - -static vsi_status vx_op_compute_gemm - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[3] = {NULL}; - vx_tensor paraTensor0 = NULL, paraTensor1 = NULL, paraTensor2 = NULL; - int32_t size[4] = {1}; - vsi_nn_tensor_attr_t out_attr; - int16_t *out_buffer = NULL; - uint32_t output_H = 0, output_W = 0; - float *buf = NULL; - - memcpy( &out_attr, &(outputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); - output_W = out_attr.size[0]; - output_H = out_attr.size[1]; - out_buffer = (int16_t*)malloc( output_W * output_H * 3 * sizeof(int16_t) ); - status = setUPGridData( output_W, output_H, out_attr.dtype.scale, out_attr.dtype.zero_point, - out_attr.dtype, out_attr.dtype.qnt_type, out_attr.dtype.fl, out_buffer ); - if (status == VSI_FAILURE) - { - goto OnError; - } - status = vsi_nn_copy_tensor_patch( inputs[1]->t, &inputs[1]->attr, out_buffer, VX_WRITE_ONLY ); - if (status == VSI_FAILURE) - { - goto OnError; - } - /* Copy tensor to buffer, and convert bufer to float32 format */ - buf = vsi_nn_ConvertTensorToFloat32Data(self->graph, inputs[1]); - if (buf == NULL) - { - goto OnError; - } - memset( params, 0, sizeof( vx_reference * ) * 3 ); - - size[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; - size[1] = 1; - paraTensor0 = vxReshapeTensor( inputs[0]->t, size, 2 ); - - size[0] = inputs[1]->attr.size[0] * output_W; - size[1] = output_H; - paraTensor1 = vxReshapeTensor( inputs[1]->t, size, 2 ); - - size[0] = inputs[0]->attr.size[1] * output_W; - size[1] = output_H; - paraTensor2 = vxReshapeTensor( inputs[2]->t, size, 2 ); - - if ( NULL == self->n ) - { - status = VSI_FAILURE; - goto OnError; - } - - params[0] = (vx_reference)paraTensor0; - params[1] = (vx_reference)paraTensor1; - params[2] = (vx_reference)paraTensor2; - /* Pass parameters to node. 
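setUPGridData above fills the grid tensor with one homogeneous row (y, x, 1) per output pixel, both coordinates normalised to [-1, 1), and only afterwards converts the buffer to the tensor dtype with vsi_nn_Float32ToDtype. A standalone float32 sketch of the grid it produces (sizes are example values):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    unsigned output_W = 4, output_H = 2;   /* example output extent */
    unsigned x, y, idx = 0;
    float *grid = (float *)malloc(output_W * output_H * 3 * sizeof(float));

    if (grid == NULL)
    {
        return 1;
    }
    for (y = 0; y < output_H; y++)
    {
        for (x = 0; x < output_W; x++)
        {
            grid[idx++] = y * 1.0f / output_H * 2 - 1;  /* normalised row    */
            grid[idx++] = x * 1.0f / output_W * 2 - 1;  /* normalised column */
            grid[idx++] = 1.0f;                         /* homogeneous term  */
        }
    }
    printf("first row of grid: (%.3f, %.3f, %.3f)\n", grid[0], grid[1], grid[2]);
    free(grid);
    return 0;
}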
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, 3 ); - -OnError: - if (paraTensor0) - { - vxReleaseTensor( ¶Tensor0 ); - paraTensor0 = NULL; - } - if (paraTensor1) - { - vxReleaseTensor( ¶Tensor1 ); - paraTensor1 = NULL; - } - if (paraTensor2) - { - vxReleaseTensor( ¶Tensor2 ); - paraTensor2 = NULL; - } - if (out_buffer) - { - free(out_buffer); - out_buffer = NULL; - } - return status; -} - - -static vsi_status vx_op_compute_interp - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[3]; - vx_border_t border; - - memset( params, 0, sizeof( vx_reference * ) * 3 ); - - params[0] = (vx_reference)inputs[3]->t; - params[1] = (vx_reference)inputs[2]->t; - params[2] =(vx_reference)outputs[0]->t; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, 3 ); - - border.mode = VX_BORDER_CONSTANT; - border.constant_value.S16 = 0; - - status |= vxSetNodeAttribute( self->n, VX_NODE_BORDER, - &border, sizeof(border) ); - // _release_params( args, 3 ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute_setupThre, - vx_op_compute_gemm, - vx_op_compute_interp, - NULL -}; static vsi_status op_compute ( @@ -510,180 +48,40 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VX_SUCCESS; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; - vsi_nn_tensor_attr_t attr,outattr; - vsi_nn_tensor_t *tmp_output_tensor[5] = {0}; - vsi_nn_tensor_t *input_t,*fc_t,*output_t; - vx_graph graph = self->graph->g; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_spatial_transformer_param * p; + p = (vsi_nn_spatial_transformer_param *)&self->nn_param.spatial_transformer; - memset( &kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t) ); - memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + param = vsi_nn_kernel_param_create(); - memcpy( &attr, &(inputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.fl = 0; - attr.dtype.scale = 1; - attr.dtype.zero_point = 0; - attr.vtl = FALSE; + vsi_nn_kernel_param_add_int32( param, "has_theta_1_1", p->has_theta_1_1 ); + vsi_nn_kernel_param_add_int32( param, "has_theta_1_2", p->has_theta_1_2 ); + vsi_nn_kernel_param_add_int32( param, "has_theta_1_3", p->has_theta_1_3 ); + vsi_nn_kernel_param_add_int32( param, "has_theta_2_1", p->has_theta_2_1 ); + vsi_nn_kernel_param_add_int32( param, "has_theta_2_2", p->has_theta_2_2 ); + vsi_nn_kernel_param_add_int32( param, "has_theta_2_3", p->has_theta_2_3 ); + vsi_nn_kernel_param_add_float32( param, "theta_1_1", p->theta_1_1 ); + vsi_nn_kernel_param_add_float32( param, "theta_1_2", p->theta_1_2 ); + vsi_nn_kernel_param_add_float32( param, "theta_1_3", p->theta_1_3 ); + vsi_nn_kernel_param_add_float32( param, "theta_2_1", p->theta_2_1 ); + vsi_nn_kernel_param_add_float32( param, "theta_2_2", p->theta_2_2 ); + vsi_nn_kernel_param_add_float32( param, "theta_2_3", p->theta_2_3 ); + vsi_nn_kernel_param_add_int32( param, "align_corners", p->align_corners ); - input_t = vsi_nn_CreateTensor( self->graph, &attr ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "spatial_transformer", + inputs, 2, + outputs, 1, param ); - memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.vx_type = 
VSI_NN_TYPE_FLOAT16; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.fl = 0; - attr.dtype.scale = 1; - attr.dtype.zero_point = 0; - attr.vtl = FALSE; - fc_t= vsi_nn_CreateTensor( self->graph, &attr ); - - memcpy( &attr, &(outputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.fl = 0; - attr.dtype.scale = 1; - attr.dtype.zero_point = 0; - attr.vtl = FALSE; - output_t= vsi_nn_CreateTensor( self->graph, &attr ); - - vxTensorCopyNode( graph, inputs[0]->t, input_t->t ); - vxTensorCopyNode( graph, inputs[1]->t, fc_t->t ); - vxTensorCopyNode( graph, output_t->t, outputs[0]->t ); - - memcpy( &outattr, &(outputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); - // Tensor for thre_output - memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t)); - attr.size[0] = 3; - attr.size[1] = 2; - attr.size[2] = 1; - attr.size[3] = 1; - attr.dim_num = 2; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - attr.vtl = FALSE; - tmp_output_tensor[0] = vsi_nn_CreateTensor( self->graph, &attr ); - - // Tensor for grid - attr.size[0] = 3; - attr.size[1] = outattr.size[0] * outattr.size[1];//p->output_H * p->output_W; - attr.size[2] = 1; - attr.size[3] = 1; - attr.dim_num = 2; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - attr.vtl = FALSE; - tmp_output_tensor[1] = vsi_nn_CreateTensor( self->graph, &attr ); - - // Tensor for grid_out - attr.size[0] = 2 * outattr.size[0];//2 * p->output_W; - attr.size[1] = outattr.size[1];//p->output_H ; - attr.size[2] = 1; - attr.size[3] = 1; - attr.dim_num = 2; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - attr.vtl = FALSE; - tmp_output_tensor[2] = vsi_nn_CreateTensor( self->graph, &attr ); - status = VSI_FAILURE; - - - kernel_info.type = VX_KERNEL_TYPE_VX; - kernel_info.kernel = vx_kernel_SPATIAL_TRANSFORMER_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc( kernel_info.resource_num * sizeof(char *) ); - kernel_info.resource_name[0] = "vsi_nn_kernel_transform_setupThres"; - - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath( path ); - - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - - // add setupThre - self->n = vsi_nn_RegisterClientKernelAndNewNode( self->graph, &kernel_info); - - if (NULL != op_compute_list[kernel_info.init_index]) + if( self->n ) { - status = op_compute_list[kernel_info.init_index]( self, &fc_t, tmp_output_tensor ); + status = VSI_SUCCESS; } - if ( NULL == self->n ) - { - status = VSI_FAILURE; - goto final; - } + vsi_nn_kernel_param_release( ¶m ); - // add gemm - kernel_info.kernel_index = 2; - kernel_info.init_index = 2; - kernel_info.resource_name[0] = "vsi_nn_kernel_transform_gemm"; - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, tmp_output_tensor, outputs); - } - - // add interp - if (input_t->attr.dim_num == 2 && input_t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 - && output_t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) - { - kernel_info.kernel_index = 3; - kernel_info.init_index = 3; - } - else if (input_t->attr.dim_num == 4 && input_t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 - && output_t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) - { - kernel_info.kernel_index = 4; - kernel_info.init_index = 3; - } - kernel_info.resource_name[0] = "vsi_nn_kernel_transform_interp"; - self->n = vsi_nn_RegisterClientKernelAndNewNode( self->graph, &kernel_info); 
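The rewritten op_compute (the kernel_param/kernel_selector path added above) replaces this hand-rolled three-kernel pipeline with the generic kernel framework: scalar attributes are packed into a vsi_nn_kernel_param_t and the backend is picked by vsi_nn_kernel_selector. A condensed sketch of that pattern, written for a hypothetical single-attribute op ("example_op" and "alpha" are placeholders, not kernels in the tree):

#include "vsi_nn_types.h"
#include "kernel/vsi_nn_kernel.h"

static vsi_status example_op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_param_t * param = vsi_nn_kernel_param_create();

    /* Every scalar attribute the kernel needs travels through the param map. */
    vsi_nn_kernel_param_add_float32( param, "alpha", 0.5f );

    /* The selector picks a CPU/VX/shader implementation and creates the node. */
    self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
                    "example_op",
                    inputs, 1,
                    outputs, 1, param );

    if( self->n )
    {
        status = VSI_SUCCESS;
    }
    vsi_nn_kernel_param_release( &param );

    return status;
}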
- tmp_output_tensor[3] = input_t; - - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index]( self, tmp_output_tensor, &output_t ); - } - if (tmp_output_tensor[0]) - { - vsi_nn_ReleaseTensor( &tmp_output_tensor[0] ); - tmp_output_tensor[0] = NULL; - } - if (tmp_output_tensor[1]) - { - vsi_nn_ReleaseTensor( &tmp_output_tensor[1] ); - tmp_output_tensor[1] = NULL; - } - if (tmp_output_tensor[2]) - { - vsi_nn_ReleaseTensor( &tmp_output_tensor[2] ); - tmp_output_tensor[2] = NULL; - } - if (input_t) - { - vsi_nn_ReleaseTensor( &input_t ); - input_t = NULL; - } - if (fc_t) - { - vsi_nn_ReleaseTensor( &fc_t ); - fc_t = NULL; - } - if (output_t) - { - vsi_nn_ReleaseTensor( &output_t ); - output_t = NULL; - } - -final: - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - kernel_info.resource_name = NULL; - } return status; } /* op_compute() */ @@ -694,7 +92,43 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - /*TODO: Check tensor shapes. */ + BEGIN_IO_TYPE_DECL(SPATIAL_TRANSFORMER, 2, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(SPATIAL_TRANSFORMER) + if (!VALIDATE_OP_IO_TYPES(SPATIAL_TRANSFORMER, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; } /* op_check() */ @@ -705,18 +139,47 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - /* TODO: Add code to comput outputs' shape. 
*/ - //vsi_nn_spatial_transformer_param * p; - //p = (vsi_nn_spatial_transformer_param *)&node->nn_param.client_param; + vsi_nn_spatial_transformer_param * p; + p = (vsi_nn_spatial_transformer_param *)&node->nn_param.spatial_transformer; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t i = 0; + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = p->output_W; + outputs[0]->attr.size[1] = p->output_H; + if (p->output_W == 0) + { + outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; + } + + if (p->output_H == 0) + { + outputs[0]->attr.size[0] = inputs[0]->attr.size[1]; + } + + for (i = 2; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[0]->attr.size[0] = inputs[0]->attr.size[0];//p->output_W; // W - outputs[0]->attr.size[1] = inputs[0]->attr.size[1];//p->output_H; // H - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; // C - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; // N return TRUE; } /* op_setup() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.spatial_transformer.align_corners = FALSE; + + return status; +} /* op_init() */ + #ifdef __cplusplus extern "C" { #endif @@ -724,7 +187,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ SPATIAL_TRANSFORMER, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c index 957ecd5..812cea3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c @@ -24,72 +24,15 @@ #include #include -#include "vsi_nn_pub.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (0) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _VX_KERNEL_VAR (vx_client_kernel_SYNC_HOST) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define _VSI_PARAM (vsi_nn_client_sync_host_param) -extern vx_kernel_description_t * vx_kernel_SYNC_HOST_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Pass parameters to node. 
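The new op_setup derives the output extent from output_W/output_H when the caller left the output shape on auto, falling back to the input extent when either value is zero (the output_H fallback presumably targets size[1], the height). A minimal standalone sketch of that shape rule, assuming a plain array of dimension sizes rather than a tensor attribute:

#include <stdio.h>

/* Hypothetical helper: infer the spatial-transformer output shape from the
 * input shape and the optional output_W/output_H attributes (0 means "same
 * as input"). */
static void infer_st_shape(const unsigned in_size[4], unsigned dim_num,
                           unsigned output_W, unsigned output_H,
                           unsigned out_size[4])
{
    unsigned i;

    out_size[0] = output_W ? output_W : in_size[0];  /* width  */
    out_size[1] = output_H ? output_H : in_size[1];  /* height */
    for (i = 2; i < dim_num; i++)
    {
        out_size[i] = in_size[i];                    /* channels / batch */
    }
}

int main(void)
{
    unsigned in_size[4] = { 28, 28, 3, 1 };
    unsigned out_size[4] = { 0 };

    infer_st_shape(in_size, 4, 14, 0, out_size);     /* only width overridden */
    printf("%u x %u x %u x %u\n", out_size[0], out_size[1], out_size[2], out_size[3]);
    return 0;
}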
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - NULL -}; static vsi_status op_compute ( @@ -98,48 +41,20 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_SYNC_HOST_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "sync_host"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "sync_host", + inputs, 1, + outputs, 1, NULL ); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + if( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } return status; -} /* op_compute() */ +}/* op_compute() */ static vsi_bool op_check ( diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c index f752f1e..5c346bb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c @@ -34,202 +34,60 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (0) #define _INPUT_NUM (2) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) -extern vx_kernel_description_t * vx_kernel_TENSORSTACKCONCAT_list[]; - -static vsi_bool _reshape_tensor +static vsi_bool _get_stackconcat_shape ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs + const int32_t* shape_x, const int32_t rank_x, + const int32_t* shape_output, const int32_t rank_output, + const int32_t axis, + int32_t* out_shape_0, uint32_t* out_rank_0, + int32_t* out_shape_1, uint32_t* out_rank_1, + int32_t* out_shape_output, uint32_t* out_rank_output ) { - uint32_t i = 0; - uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t axis = 0; - vsi_nn_tensorstackconcat_param * p = NULL; - uint32_t before_size = 1; - uint32_t after_size = 1; - uint32_t * input_sizes = inputs[0]->attr.size; - uint32_t dims = inputs[0]->attr.dim_num; - uint32_t * output_sizes = outputs[0]->attr.size; - uint32_t new_dims = 0; + int32_t i = 0; + uint32_t innerSize = 1; + uint32_t outerSize = 1; - p = &(self->nn_param.tensorstackconcat); - axis = p->axis; - - for ( i = 0; i < (uint32_t)axis; i++) + for ( i = 0; i < rank_x; i++) { - before_size *= input_sizes[i]; + innerSize *= shape_x[i]; } - for ( i = axis + 1; i < dims; i++) + for ( i = axis + 1; i < rank_x; i++) { - after_size *= input_sizes[i]; + outerSize 
*= shape_x[i]; } - sizes[0] = before_size; - sizes[1] = input_sizes[axis]; - sizes[2] = after_size; - new_dims = 3; - p->local->local_tensor[0] = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, new_dims); + out_shape_0[0] = innerSize; + out_shape_0[1] = shape_x[axis]; + out_shape_0[2] = outerSize; + *out_rank_0 = 3; - sizes[0] = 1; - sizes[1] = 1; - new_dims = 2; - p->local->local_tensor[1] = vxReshapeTensor(inputs[1]->t, (int32_t *)sizes, new_dims); + out_shape_1[0] = 1; + out_shape_1[1] = 1; + *out_rank_1 = 2; - before_size = 1; - after_size = 1; - for ( i = 0; i < (uint32_t)axis; i++) + innerSize = 1; + outerSize = 1; + for ( i = 0; i < axis; i++) { - before_size *= output_sizes[i]; + innerSize *= shape_output[i]; } - for ( i = axis + 1; i < dims; i++) + for ( i = axis + 1; i < rank_output; i++) { - after_size *= output_sizes[i]; + outerSize *= shape_output[i]; } - sizes[0] = before_size; - sizes[1] = output_sizes[axis]; - sizes[2] = after_size; - new_dims = 3; - p->local->local_tensor[2] = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, new_dims); + out_shape_output[0] = innerSize; + out_shape_output[1] = shape_output[axis]; + out_shape_output[2] = outerSize; + *out_rank_output = 3; - p->axis = 1; return TRUE; } - -static void _set_inputs_outputs - ( - vsi_nn_node_t * self, - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_nn_tensorstackconcat_param *p = NULL; - uint32_t i = 0; - - p = &(self->nn_param.tensorstackconcat); - - for (i = 0; i < _IO_NUM; i++) - { - params[i] = (vx_reference)(p->local->local_tensor[i]); - } -} /* _set_inputs_outputs() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( self, params, inputs, outputs ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - //_release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; - int8_t inputFixedPointPos = inputs[0]->attr.dtype.fl; - int8_t outputFixedPointPos = outputs[0]->attr.dtype.fl; - int32_t inputZeroPoint = inputs[0]->attr.dtype.zero_point; - int32_t outputZeroPoint = outputs[0]->attr.dtype.zero_point; - vx_float32 inputScale = inputs[0]->attr.dtype.scale; - vx_float32 outputScale = outputs[0]->attr.dtype.scale; - vsi_bool is16Bits = FALSE; - vsi_bool is8Bits = FALSE; - - is16Bits = ((inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16) - || (inputDataFormat == VSI_NN_TYPE_INT16 && outputDataFormat == VSI_NN_TYPE_INT16 - && inputFixedPointPos == outputFixedPointPos)) ? TRUE : FALSE; - is8Bits = ((inputDataFormat == VSI_NN_TYPE_INT8 && outputDataFormat == VSI_NN_TYPE_INT8 - && inputFixedPointPos == outputFixedPointPos) - || (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8 - && inputZeroPoint == outputZeroPoint && inputScale == outputScale)) ? 
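_get_stackconcat_shape collapses everything below the concat axis into one inner extent and everything above it into one outer extent, so the kernel always sees a 3-D view {inner, size[axis], outer}. A standalone sketch of that flattening, consistent with the output-shape branch where the inner product runs over the dimensions before the axis:

#include <stdio.h>

/* Flatten an N-D shape to {inner, shape[axis], outer} around "axis". */
static void flatten_around_axis(const int shape[], int rank, int axis,
                                int out_shape[3])
{
    int i, inner = 1, outer = 1;

    for (i = 0; i < axis; i++)        inner *= shape[i];
    for (i = axis + 1; i < rank; i++) outer *= shape[i];

    out_shape[0] = inner;
    out_shape[1] = shape[axis];
    out_shape[2] = outer;
}

int main(void)
{
    int shape[4] = { 8, 16, 4, 2 };   /* example W, H, C, N */
    int view[3];

    flatten_around_axis(shape, 4, 2, view);
    printf("{%d, %d, %d}\n", view[0], view[1], view[2]);  /* prints {128, 4, 2} */
    return 0;
}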
TRUE : FALSE; - - if (is16Bits) - { - kernel_info->kernel_index = 1; - } - else if (is8Bits) - { - kernel_info->kernel_index = 2; - } - else - { - VSILOGE("Not support input or output data format!(TENSORSTACKCONCAT) at [%s : %d]\n", __FILE__, __LINE__); - return VSI_FAILURE; - } - - return VSI_SUCCESS; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( self, params, inputs, outputs ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -238,39 +96,39 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_info_t kernel_info; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t shape[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank[3] = {0}; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + _get_stackconcat_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + self->nn_param.tensorstackconcat.axis, + shape[0], &rank[0], shape[1], &rank[1], shape[2], &rank[2] ); - /* reshape input/output */ - _reshape_tensor( self, inputs, outputs); + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shape[0], rank[0] ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shape[1], rank[1] ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shape[2], rank[2] ); - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_tensorstackconcat"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_TENSORSTACKCONCAT_list; - kernel_info.init_index = 1; + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "tensorstackconcat", + &reshape_tensors[0], 2, + &reshape_tensors[2], 1, NULL ); - if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + + if( self->n ) { - vx_op_pre_compute(self, inputs, outputs, &kernel_info); + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } return status; -} /* op_compute() */ +} static vsi_bool op_setup ( @@ -281,23 +139,15 @@ static vsi_bool op_setup { vsi_bool ret = FALSE; vsi_nn_tensorstackconcat_param *p = NULL; - vsi_nn_stackconcat_lcl_data *local = NULL; int32_t axis = 0; - if( NULL == self ) + if ( NULL == self ) { return ret; } p = &(self->nn_param.tensorstackconcat); axis = p->axis; - local = 
(vsi_nn_stackconcat_lcl_data *)malloc(sizeof(vsi_nn_stackconcat_lcl_data)); - if (NULL == local) - { - return ret; - } - memset(local, 0, sizeof(vsi_nn_stackconcat_lcl_data)); - p->local = local; if (axis < 0) { @@ -333,12 +183,12 @@ static vsi_bool op_check VSILOGE("Invalid Axis: %d, (TENSORSTACKCONCAT) at [%s : %d]\n", axis, __FILE__, __LINE__); return FALSE; } - if( VSI_NN_DIM_AUTO == out_dims ) + if ( VSI_NN_DIM_AUTO == out_dims ) { VSILOGE("Invalid output, (TENSORSTACKCONCAT) at [%s : %d]\n", __FILE__, __LINE__); return FALSE; } - if( dims != out_dims ) + if ( dims != out_dims ) { VSILOGE("Input and output's dims not matched, (TENSORSTACKCONCAT) at [%s : %d]\n", __FILE__, __LINE__); return FALSE; @@ -346,12 +196,14 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(TENSORSTACKCONCAT, 2, 1) - IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) END_IO_TYPE_DECL(TENSORSTACKCONCAT) - if(!VALIDATE_OP_IO_TYPES(TENSORSTACKCONCAT, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(TENSORSTACKCONCAT, self, inputs, self->input.num, outputs, self->output.num)) + { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -375,30 +227,6 @@ static vsi_status op_init return status; } /* op_init() */ -static vsi_status op_deinit - ( - vsi_nn_node_t * self - ) -{ - vsi_nn_tensorstackconcat_param *p = &(self->nn_param.tensorstackconcat); - uint32_t i = 0; - if (p->local) - { - for (i = 0; i < _VSI_NN_STACKCONCAT_LOCAL_TENSOR_NUM; i++) - { - if (p->local->local_tensor[i]) - { - vxReleaseTensor(&(p->local->local_tensor[i])); - p->local->local_tensor[i] = NULL; - } - } - vsi_nn_safe_free(p->local); - } - vsi_nn_op_common_deinit(self); - - return VSI_SUCCESS; -} /* op_deinit() */ - #ifdef __cplusplus extern "C" { #endif @@ -408,7 +236,7 @@ DEF_OP_REG /* op_name */ TENSORSTACKCONCAT, /* init */ op_init, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, /* setup */ op_setup, /* optimize */ NULL, @@ -418,4 +246,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c index 5cbd3cb..a0d5e6f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c @@ -35,6 +35,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "vsi_nn_log.h" +#include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_util.h" static vsi_status op_compute @@ -96,7 +97,8 @@ static vsi_status op_optimize return VSI_FAILURE; } memset(local, 0, sizeof(vsi_nn_variable_lcl_data)); - if( NULL != inputs[0]->t && NULL == outputs[0]->t ) + if( NULL != inputs[0]->t && NULL == outputs[0]->t && + vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype)) { VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); outputs[0]->t = vxReshapeTensor(inputs[0]->t, (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 8c07eb6..470b1a3 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ 
b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -428,6 +428,8 @@ static _op_param_gen_t s_op_gen[] = /* ONE_HOT */ NULL, /* NMS */ NULL, /* GROUPED_CONV1D */ NULL, + /* SCATTER_ND_UPDATE */ NULL, + /* GELU */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index 0ea0e3e..eb7f494 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -51,7 +51,12 @@ static vsi_status query_hardware_caps #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT context->config.subGroupSize = paramExt.subGroupSize; +#if VX_VA40_EXT_SUPPORT + context->config.use_40bits_va = paramExt.supportVA40; #endif + +#endif + if(param.evis1 == TRUE && param.evis2 == FALSE) { context->config.evis.ver = VSI_NN_HW_EVIS_1; @@ -70,6 +75,46 @@ final: return status; } +int32_t vsi_nn_getEnv(const char* name, char** env_s) { + int32_t ret = 0; + *env_s = getenv(name); + if (*env_s) { + ret = TRUE; + } + return ret; +} + +static vsi_status vsi_nn_initOptions + ( + vsi_nn_runtime_option_t *options + ) +{ + char* env_s = NULL; + + env_s = NULL; + options->enable_shader = 1; + if (vsi_nn_getEnv("VIV_VX_ENABLE_SHADER", &env_s) && env_s) + { + options->enable_shader = atoi(env_s); + } + + env_s = NULL; + options->enable_opcheck = 1; + if (vsi_nn_getEnv("VSI_NN_ENABLE_OPCHECK", &env_s) && env_s) + { + options->enable_opcheck = atoi(env_s); + } + + env_s = NULL; + options->enable_concat_optimize = 1; + if (vsi_nn_getEnv("VSI_NN_ENABLE_CONCAT_OPTIMIZE", &env_s) && env_s) + { + options->enable_concat_optimize = atoi(env_s); + } + + return VSI_SUCCESS; +} + vsi_nn_context_t vsi_nn_CreateContext ( void ) { @@ -96,6 +141,12 @@ vsi_nn_context_t vsi_nn_CreateContext return NULL; } + if (vsi_nn_initOptions(&context->options) != VSI_SUCCESS) + { + vsi_nn_ReleaseContext(&context); + return NULL; + } + return context; } /* vsi_nn_CreateContext() */ @@ -113,4 +164,3 @@ void vsi_nn_ReleaseContext *ctx = NULL; } } /* vsi_nn_ReleaseContext() */ - diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index c94a1ca..57ab550 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -739,6 +739,13 @@ vsi_status vsi_nn_SetupGraph goto final; } + /* Set all of tensor attribute in graph to high precision */ + status = set_graph_precision(graph, nodes_list); + if(VSI_SUCCESS != status) + { + goto final; + } + /* Create vx node and vx virtual tensor */ status = compute_node( graph, nodes_list ); if(VSI_SUCCESS != status) @@ -746,12 +753,6 @@ vsi_status vsi_nn_SetupGraph goto final; } - /* Set all of tensor attribute in graph to high precision */ - status = set_graph_precision(graph, nodes_list); - if(VSI_SUCCESS != status) - { - goto final; - } /* Try setup graph complete signal node. 
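vsi_nn_initOptions above seeds the new runtime options from environment variables, each switch defaulting to enabled when the variable is unset. A standalone sketch of the same getenv/atoi pattern (the helper name is only an example):

#include <stdio.h>
#include <stdlib.h>

/* Read an integer switch from the environment, falling back to a default
 * when the variable is unset; mirrors the handling of VIV_VX_ENABLE_SHADER,
 * VSI_NN_ENABLE_OPCHECK and VSI_NN_ENABLE_CONCAT_OPTIMIZE. */
static int read_env_switch(const char *name, int default_value)
{
    const char *env_s = getenv(name);
    return env_s ? atoi(env_s) : default_value;
}

int main(void)
{
    int enable_opcheck = read_env_switch("VSI_NN_ENABLE_OPCHECK", 1);
    printf("opcheck %s\n", enable_opcheck ? "enabled" : "disabled");
    return 0;
}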
*/ status = vsi_nn_TrySetupCompleteSignalNode( graph ); TEST_CHECK_STATUS( status, final ); diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index 9c4485e..7d9a623 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -159,7 +159,8 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor ( vsi_nn_node_t* node, vsi_nn_tensor_attr_t* input_attr, - vsi_nn_tensor_attr_t* weight_attr + vsi_nn_tensor_attr_t* weight_attr, + vsi_bool use_virtual_tensor ) { vsi_nn_tensor_attr_t attr; @@ -171,8 +172,8 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor /* create zero bias for NN/TP */ attr.size[0] = weight_attr->size[1]; attr.dim_num = 1; - attr.vtl = FALSE; - attr.is_const = TRUE; + attr.vtl = use_virtual_tensor; + attr.is_const = !use_virtual_tensor; if(input_attr->dtype.qnt_type != VSI_NN_QNT_TYPE_NONE && input_attr->dtype.qnt_type != weight_attr->dtype.qnt_type) @@ -366,20 +367,19 @@ vsi_nn_internal_node_t* vsi_nn_internal_get_node_by_uid int uid ) { - vsi_nn_internal_node_t* head = NULL; vsi_nn_internal_node_t* curr = NULL; if( node && node->internal_node_wksp ) { - head = WKSP(node)->nodes; - while( NULL != head ) + curr = WKSP(node)->nodes; + while( NULL != curr ) { - curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListPopStart( - (vsi_nn_link_list_t **)&head ); if( curr->node->uid == (uint32_t)uid ) { return curr; } + + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)curr ); } } diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index 6743605..e0a5bd6 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -189,6 +189,7 @@ static _node_template s_template[] = /* GROUPNORM */ NULL, /* SEQUENCE_MASK */ NULL, /* REPEAT */ NULL, + /* SCATTER_ND_UPDATE */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_ops.c b/src/tim/vx/internal/src/vsi_nn_ops.c index aba3f2c..84cdfb1 100644 --- a/src/tim/vx/internal/src/vsi_nn_ops.c +++ b/src/tim/vx/internal/src/vsi_nn_ops.c @@ -26,6 +26,7 @@ #include "vsi_nn_client_op.h" #include "vsi_nn_node.h" #include "vsi_nn_types.h" +#include "vsi_nn_graph.h" #include "vsi_nn_log.h" #define DEF_OP(NAME, ...) 
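The change to vsi_nn_internal_get_node_by_uid matters because the old loop popped nodes off the workspace list while searching, so a lookup silently drained it; the new loop only follows the next pointer. A minimal standalone sketch of the non-destructive traversal, using a plain singly linked list in place of vsi_nn_link_list_t:

#include <stdio.h>

struct node { int uid; struct node *next; };

/* Non-destructive lookup: walk the links and leave the list untouched
 * (this is what following vsi_nn_LinkListNext amounts to). */
static struct node *find_by_uid(struct node *head, int uid)
{
    struct node *curr = head;
    while (curr)
    {
        if (curr->uid == uid) return curr;
        curr = curr->next;
    }
    return NULL;
}

int main(void)
{
    struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
    struct node *hit = find_by_uid(&a, 2);

    /* The list is still intact after the search. */
    printf("found uid %d, head still links to uid %d\n",
           hit ? hit->uid : -1, a.next->uid);
    return 0;
}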
extern vsi_nn_op_proc_t vsi_nn_op_##NAME; @@ -268,12 +269,12 @@ vsi_bool vsi_nn_OpCheck ret = FALSE; proc = vsi_nn_OpGetProc( op ); - if( NULL != proc ) + if ( NULL != proc ) { ret = TRUE; - if( proc->check ) + if ( proc->check && node->graph->ctx->options.enable_opcheck) { - ret = proc->check( node, inputs, outputs ); + ret = proc->check( node, inputs, outputs ); } } return ret; @@ -380,4 +381,3 @@ const char * vsi_nn_OpGetName } return name; } /* vsi_nn_GetOpName() */ - diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index b12b5d4..b389bc2 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -339,7 +339,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc if( !bias ) { /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE); tensor = tensor1->t; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); @@ -382,7 +382,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc if( !bias ) { /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE); tensor = tensor1->t; } @@ -475,7 +475,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu if( !bias ) { /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE); tensor = tensor1->t; } diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index a7bd3c7..263d26f 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -2356,3 +2356,66 @@ vsi_status vsi_nn_SwapHandle return VSI_SUCCESS; } /* vsi_nn_SwapHandle() */ +vsi_bool vsi_nn_ConvertTensor + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* input, + vsi_nn_tensor_t* output + ) +{ + vsi_bool ret = TRUE; + uint8_t* src_buf = NULL; + uint32_t sz = 0; + uint32_t src_stride = 0; + uint32_t dst_stride = 0; + uint32_t dst_buf_sz = 0; + uint8_t* dst_buf = NULL; + + if( NULL == graph || NULL == input || NULL == output ) + { + return FALSE; + } + + src_buf = vsi_nn_ConvertTensorToData( graph, input ); + if ( NULL == src_buf ) + { + VSILOGE( "Convert data fail." ); + return FALSE; + } + + sz = vsi_nn_GetElementNum( output ); + src_stride = vsi_nn_TypeGetBytes( input->attr.dtype.vx_type ); + dst_stride = vsi_nn_TypeGetBytes( output->attr.dtype.vx_type ); + dst_buf_sz = sz * dst_stride; + dst_buf = (uint8_t *)malloc( dst_buf_sz ); + + if ( dst_buf ) + { + uint32_t i = 0; + vsi_status status = VSI_SUCCESS; + + for ( i = 0; i < sz; i ++ ) + { + status = vsi_nn_DtypeConvert( &src_buf[src_stride * i], + &input->attr.dtype, &dst_buf[dst_stride * i], &output->attr.dtype ); + if( VSI_FAILURE == status ) + { + ret = FALSE; + VSILOGE("Convert default_value to dtype fail"); + break; + } + } + + status = vsi_nn_CopyDataToTensor( graph, output, dst_buf ); + if ( VSI_FAILURE == status ) + { + ret = FALSE; + VSILOGE("Copy data to tensor fail"); + } + } + + vsi_nn_safe_free( src_buf ); + vsi_nn_safe_free( dst_buf ); + + return ret; +} \ No newline at end of file
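A possible use of the new vsi_nn_ConvertTensor helper: materialise a float16 copy of an already-filled tensor, letting the helper convert element by element through the dtype carried in each attribute. A sketch under the assumption that the graph and the populated source tensor already exist; the helper name make_fp16_copy is illustrative only.

#include <string.h>
#include "vsi_nn_pub.h"

static vsi_nn_tensor_t * make_fp16_copy
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t * src
    )
{
    vsi_nn_tensor_attr_t attr;
    vsi_nn_tensor_t * dst = NULL;

    /* Same shape as the source, but plain float16 without quantization. */
    memcpy( &attr, &src->attr, sizeof( attr ) );
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
    attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
    attr.vtl = FALSE;

    dst = vsi_nn_CreateTensor( graph, &attr );
    if( dst && !vsi_nn_ConvertTensor( graph, src, dst ) )
    {
        vsi_nn_ReleaseTensor( &dst );
        dst = NULL;
    }
    return dst;
}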